summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry')
-rw-r--r--pkg/sentry/arch/BUILD4
-rw-r--r--pkg/sentry/arch/arch.go7
-rw-r--r--pkg/sentry/arch/arch_amd64.go12
-rw-r--r--pkg/sentry/arch/arch_arm64.go12
-rw-r--r--pkg/sentry/arch/signal_act.go2
-rw-r--r--pkg/sentry/arch/signal_amd64.go27
-rw-r--r--pkg/sentry/arch/signal_arm64.go30
-rw-r--r--pkg/sentry/arch/signal_stack.go2
-rw-r--r--pkg/sentry/arch/stack.go179
-rw-r--r--pkg/sentry/arch/stack_unsafe.go69
-rw-r--r--pkg/sentry/control/BUILD1
-rw-r--r--pkg/sentry/control/proc.go28
-rw-r--r--pkg/sentry/devices/memdev/BUILD5
-rw-r--r--pkg/sentry/devices/memdev/full.go4
-rw-r--r--pkg/sentry/devices/memdev/null.go4
-rw-r--r--pkg/sentry/devices/memdev/random.go4
-rw-r--r--pkg/sentry/devices/memdev/zero.go28
-rw-r--r--pkg/sentry/devices/ttydev/ttydev.go2
-rw-r--r--pkg/sentry/fdimport/BUILD1
-rw-r--r--pkg/sentry/fdimport/fdimport.go22
-rw-r--r--pkg/sentry/fs/g3doc/fuse.md99
-rw-r--r--pkg/sentry/fs/host/BUILD2
-rw-r--r--pkg/sentry/fs/host/socket_unsafe.go4
-rw-r--r--pkg/sentry/fs/host/tty.go4
-rw-r--r--pkg/sentry/fs/inode.go2
-rw-r--r--pkg/sentry/fs/inode_overlay.go11
-rw-r--r--pkg/sentry/fs/proc/BUILD1
-rw-r--r--pkg/sentry/fs/proc/sys_net.go120
-rw-r--r--pkg/sentry/fs/proc/sys_net_state.go15
-rw-r--r--pkg/sentry/fs/proc/sys_net_test.go73
-rw-r--r--pkg/sentry/fs/proc/task.go46
-rw-r--r--pkg/sentry/fs/tty/BUILD3
-rw-r--r--pkg/sentry/fs/tty/dir.go46
-rw-r--r--pkg/sentry/fs/tty/fs.go4
-rw-r--r--pkg/sentry/fs/tty/line_discipline.go55
-rw-r--r--pkg/sentry/fs/tty/master.go37
-rw-r--r--pkg/sentry/fs/tty/queue.go14
-rw-r--r--pkg/sentry/fs/tty/replica.go (renamed from pkg/sentry/fs/tty/slave.go)88
-rw-r--r--pkg/sentry/fs/tty/terminal.go39
-rw-r--r--pkg/sentry/fs/tty/tty_test.go4
-rw-r--r--pkg/sentry/fsimpl/devpts/BUILD4
-rw-r--r--pkg/sentry/fsimpl/devpts/devpts.go52
-rw-r--r--pkg/sentry/fsimpl/devpts/devpts_test.go4
-rw-r--r--pkg/sentry/fsimpl/devpts/line_discipline.go55
-rw-r--r--pkg/sentry/fsimpl/devpts/master.go47
-rw-r--r--pkg/sentry/fsimpl/devpts/queue.go14
-rw-r--r--pkg/sentry/fsimpl/devpts/replica.go204
-rw-r--r--pkg/sentry/fsimpl/devpts/slave.go198
-rw-r--r--pkg/sentry/fsimpl/devpts/terminal.go37
-rw-r--r--pkg/sentry/fsimpl/devtmpfs/devtmpfs.go6
-rw-r--r--pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go2
-rw-r--r--pkg/sentry/fsimpl/eventfd/eventfd.go10
-rw-r--r--pkg/sentry/fsimpl/ext/BUILD4
-rw-r--r--pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go6
-rw-r--r--pkg/sentry/fsimpl/ext/block_map_file.go32
-rw-r--r--pkg/sentry/fsimpl/ext/block_map_test.go46
-rw-r--r--pkg/sentry/fsimpl/ext/dentry.go2
-rw-r--r--pkg/sentry/fsimpl/ext/directory.go11
-rw-r--r--pkg/sentry/fsimpl/ext/disklayout/BUILD3
-rw-r--r--pkg/sentry/fsimpl/ext/disklayout/block_group.go6
-rw-r--r--pkg/sentry/fsimpl/ext/disklayout/block_group_32.go2
-rw-r--r--pkg/sentry/fsimpl/ext/disklayout/block_group_64.go2
-rw-r--r--pkg/sentry/fsimpl/ext/disklayout/block_group_test.go6
-rw-r--r--pkg/sentry/fsimpl/ext/disklayout/dirent.go3
-rw-r--r--pkg/sentry/fsimpl/ext/disklayout/dirent_new.go4
-rw-r--r--pkg/sentry/fsimpl/ext/disklayout/dirent_old.go4
-rw-r--r--pkg/sentry/fsimpl/ext/disklayout/dirent_test.go6
-rw-r--r--pkg/sentry/fsimpl/ext/disklayout/disklayout.go2
-rw-r--r--pkg/sentry/fsimpl/ext/disklayout/extent.go12
-rw-r--r--pkg/sentry/fsimpl/ext/disklayout/extent_test.go9
-rw-r--r--pkg/sentry/fsimpl/ext/disklayout/inode.go3
-rw-r--r--pkg/sentry/fsimpl/ext/disklayout/inode_new.go2
-rw-r--r--pkg/sentry/fsimpl/ext/disklayout/inode_old.go2
-rw-r--r--pkg/sentry/fsimpl/ext/disklayout/inode_test.go6
-rw-r--r--pkg/sentry/fsimpl/ext/disklayout/superblock.go6
-rw-r--r--pkg/sentry/fsimpl/ext/disklayout/superblock_32.go2
-rw-r--r--pkg/sentry/fsimpl/ext/disklayout/superblock_64.go2
-rw-r--r--pkg/sentry/fsimpl/ext/disklayout/superblock_old.go2
-rw-r--r--pkg/sentry/fsimpl/ext/disklayout/superblock_test.go9
-rw-r--r--pkg/sentry/fsimpl/ext/disklayout/test_utils.go6
-rw-r--r--pkg/sentry/fsimpl/ext/ext.go2
-rw-r--r--pkg/sentry/fsimpl/ext/ext_test.go6
-rw-r--r--pkg/sentry/fsimpl/ext/extent_file.go7
-rw-r--r--pkg/sentry/fsimpl/ext/extent_test.go19
-rw-r--r--pkg/sentry/fsimpl/ext/filesystem.go22
-rw-r--r--pkg/sentry/fsimpl/ext/inode.go2
-rw-r--r--pkg/sentry/fsimpl/ext/regular_file.go6
-rw-r--r--pkg/sentry/fsimpl/ext/symlink.go6
-rw-r--r--pkg/sentry/fsimpl/ext/utils.go8
-rw-r--r--pkg/sentry/fsimpl/fuse/BUILD18
-rw-r--r--pkg/sentry/fsimpl/fuse/connection.go365
-rw-r--r--pkg/sentry/fsimpl/fuse/connection_control.go (renamed from pkg/sentry/fsimpl/fuse/init.go)171
-rw-r--r--pkg/sentry/fsimpl/fuse/connection_test.go117
-rw-r--r--pkg/sentry/fsimpl/fuse/dev.go208
-rw-r--r--pkg/sentry/fsimpl/fuse/dev_test.go105
-rw-r--r--pkg/sentry/fsimpl/fuse/directory.go105
-rw-r--r--pkg/sentry/fsimpl/fuse/file.go133
-rw-r--r--pkg/sentry/fsimpl/fuse/fusefs.go573
-rw-r--r--pkg/sentry/fsimpl/fuse/read_write.go242
-rw-r--r--pkg/sentry/fsimpl/fuse/regular_file.go230
-rw-r--r--pkg/sentry/fsimpl/fuse/request_response.go229
-rw-r--r--pkg/sentry/fsimpl/fuse/utils_test.go132
-rw-r--r--pkg/sentry/fsimpl/gofer/directory.go3
-rw-r--r--pkg/sentry/fsimpl/gofer/filesystem.go48
-rw-r--r--pkg/sentry/fsimpl/gofer/gofer.go181
-rw-r--r--pkg/sentry/fsimpl/gofer/handle.go2
-rw-r--r--pkg/sentry/fsimpl/gofer/p9file.go7
-rw-r--r--pkg/sentry/fsimpl/gofer/regular_file.go34
-rw-r--r--pkg/sentry/fsimpl/gofer/socket.go4
-rw-r--r--pkg/sentry/fsimpl/gofer/special_file.go31
-rw-r--r--pkg/sentry/fsimpl/host/BUILD2
-rw-r--r--pkg/sentry/fsimpl/host/host.go89
-rw-r--r--pkg/sentry/fsimpl/host/mmap.go6
-rw-r--r--pkg/sentry/fsimpl/host/socket_unsafe.go4
-rw-r--r--pkg/sentry/fsimpl/host/tty.go14
-rw-r--r--pkg/sentry/fsimpl/kernfs/BUILD1
-rw-r--r--pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go10
-rw-r--r--pkg/sentry/fsimpl/kernfs/fd_impl_util.go15
-rw-r--r--pkg/sentry/fsimpl/kernfs/filesystem.go276
-rw-r--r--pkg/sentry/fsimpl/kernfs/inode_impl_util.go117
-rw-r--r--pkg/sentry/fsimpl/kernfs/kernfs.go98
-rw-r--r--pkg/sentry/fsimpl/kernfs/kernfs_test.go30
-rw-r--r--pkg/sentry/fsimpl/kernfs/symlink.go6
-rw-r--r--pkg/sentry/fsimpl/kernfs/synthetic_directory.go102
-rw-r--r--pkg/sentry/fsimpl/overlay/copy_up.go166
-rw-r--r--pkg/sentry/fsimpl/overlay/directory.go14
-rw-r--r--pkg/sentry/fsimpl/overlay/filesystem.go452
-rw-r--r--pkg/sentry/fsimpl/overlay/non_directory.go118
-rw-r--r--pkg/sentry/fsimpl/overlay/overlay.go176
-rw-r--r--pkg/sentry/fsimpl/pipefs/pipefs.go8
-rw-r--r--pkg/sentry/fsimpl/proc/BUILD1
-rw-r--r--pkg/sentry/fsimpl/proc/filesystem.go7
-rw-r--r--pkg/sentry/fsimpl/proc/subtasks.go23
-rw-r--r--pkg/sentry/fsimpl/proc/task.go17
-rw-r--r--pkg/sentry/fsimpl/proc/task_fds.go50
-rw-r--r--pkg/sentry/fsimpl/proc/task_files.go138
-rw-r--r--pkg/sentry/fsimpl/proc/task_net.go7
-rw-r--r--pkg/sentry/fsimpl/proc/tasks.go27
-rw-r--r--pkg/sentry/fsimpl/proc/tasks_files.go24
-rw-r--r--pkg/sentry/fsimpl/proc/tasks_sys.go69
-rw-r--r--pkg/sentry/fsimpl/proc/tasks_sys_test.go71
-rw-r--r--pkg/sentry/fsimpl/proc/tasks_test.go3
-rw-r--r--pkg/sentry/fsimpl/signalfd/signalfd.go10
-rw-r--r--pkg/sentry/fsimpl/sockfs/sockfs.go11
-rw-r--r--pkg/sentry/fsimpl/sys/kcov.go7
-rw-r--r--pkg/sentry/fsimpl/sys/sys.go13
-rw-r--r--pkg/sentry/fsimpl/sys/sys_test.go2
-rw-r--r--pkg/sentry/fsimpl/timerfd/timerfd.go8
-rw-r--r--pkg/sentry/fsimpl/tmpfs/benchmark_test.go4
-rw-r--r--pkg/sentry/fsimpl/tmpfs/device_file.go1
-rw-r--r--pkg/sentry/fsimpl/tmpfs/directory.go4
-rw-r--r--pkg/sentry/fsimpl/tmpfs/filesystem.go50
-rw-r--r--pkg/sentry/fsimpl/tmpfs/named_pipe.go1
-rw-r--r--pkg/sentry/fsimpl/tmpfs/pipe_test.go2
-rw-r--r--pkg/sentry/fsimpl/tmpfs/regular_file.go80
-rw-r--r--pkg/sentry/fsimpl/tmpfs/socket_file.go2
-rw-r--r--pkg/sentry/fsimpl/tmpfs/symlink.go1
-rw-r--r--pkg/sentry/fsimpl/tmpfs/tmpfs.go143
-rw-r--r--pkg/sentry/fsimpl/tmpfs/tmpfs_test.go2
-rw-r--r--pkg/sentry/fsimpl/verity/BUILD25
-rw-r--r--pkg/sentry/fsimpl/verity/filesystem.go264
-rw-r--r--pkg/sentry/fsimpl/verity/verity.go291
-rw-r--r--pkg/sentry/fsimpl/verity/verity_test.go349
-rw-r--r--pkg/sentry/inet/BUILD1
-rw-r--r--pkg/sentry/inet/inet.go11
-rw-r--r--pkg/sentry/inet/test_stack.go17
-rw-r--r--pkg/sentry/kernel/BUILD8
-rw-r--r--pkg/sentry/kernel/auth/BUILD1
-rw-r--r--pkg/sentry/kernel/auth/id.go4
-rw-r--r--pkg/sentry/kernel/fd_table.go162
-rw-r--r--pkg/sentry/kernel/fd_table_test.go8
-rw-r--r--pkg/sentry/kernel/fd_table_unsafe.go61
-rw-r--r--pkg/sentry/kernel/kcov.go4
-rw-r--r--pkg/sentry/kernel/kernel.go103
-rw-r--r--pkg/sentry/kernel/pipe/BUILD1
-rw-r--r--pkg/sentry/kernel/pipe/pipe.go3
-rw-r--r--pkg/sentry/kernel/pipe/pipe_util.go12
-rw-r--r--pkg/sentry/kernel/pipe/vfs.go5
-rw-r--r--pkg/sentry/kernel/ptrace.go23
-rw-r--r--pkg/sentry/kernel/ptrace_amd64.go2
-rw-r--r--pkg/sentry/kernel/syscalls.go10
-rw-r--r--pkg/sentry/kernel/task_clone.go6
-rw-r--r--pkg/sentry/kernel/task_context.go6
-rw-r--r--pkg/sentry/kernel/task_exit.go3
-rw-r--r--pkg/sentry/kernel/task_futex.go7
-rw-r--r--pkg/sentry/kernel/task_run.go2
-rw-r--r--pkg/sentry/kernel/task_signals.go6
-rw-r--r--pkg/sentry/kernel/task_syscall.go7
-rw-r--r--pkg/sentry/kernel/task_usermem.go43
-rw-r--r--pkg/sentry/kernel/threads.go2
-rw-r--r--pkg/sentry/loader/loader.go10
-rw-r--r--pkg/sentry/mm/mm.go2
-rw-r--r--pkg/sentry/mm/mm_test.go3
-rw-r--r--pkg/sentry/mm/special_mappable.go9
-rw-r--r--pkg/sentry/mm/syscalls.go13
-rw-r--r--pkg/sentry/platform/kvm/BUILD10
-rw-r--r--pkg/sentry/platform/kvm/bluepill_amd64.s7
-rw-r--r--pkg/sentry/platform/kvm/kvm.go10
-rw-r--r--pkg/sentry/platform/kvm/machine.go21
-rw-r--r--pkg/sentry/platform/kvm/machine_amd64.go41
-rw-r--r--pkg/sentry/platform/kvm/machine_arm64.go13
-rw-r--r--pkg/sentry/platform/kvm/machine_arm64_unsafe.go4
-rw-r--r--pkg/sentry/platform/ptrace/BUILD1
-rw-r--r--pkg/sentry/platform/ptrace/filters.go9
-rw-r--r--pkg/sentry/platform/ptrace/subprocess.go5
-rw-r--r--pkg/sentry/platform/ptrace/subprocess_amd64.go2
-rw-r--r--pkg/sentry/platform/ptrace/subprocess_linux.go10
-rw-r--r--pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go61
-rw-r--r--pkg/sentry/platform/ring0/defs_amd64.go39
-rw-r--r--pkg/sentry/platform/ring0/defs_arm64.go3
-rw-r--r--pkg/sentry/platform/ring0/entry_amd64.go7
-rw-r--r--pkg/sentry/platform/ring0/entry_amd64.s193
-rw-r--r--pkg/sentry/platform/ring0/entry_arm64.s52
-rw-r--r--pkg/sentry/platform/ring0/gen_offsets/BUILD5
-rw-r--r--pkg/sentry/platform/ring0/kernel.go22
-rw-r--r--pkg/sentry/platform/ring0/kernel_amd64.go64
-rw-r--r--pkg/sentry/platform/ring0/kernel_arm64.go9
-rw-r--r--pkg/sentry/platform/ring0/lib_amd64.go12
-rw-r--r--pkg/sentry/platform/ring0/lib_amd64.s47
-rw-r--r--pkg/sentry/platform/ring0/lib_arm64.go9
-rw-r--r--pkg/sentry/platform/ring0/lib_arm64.s14
-rw-r--r--pkg/sentry/platform/ring0/offsets_amd64.go11
-rw-r--r--pkg/sentry/platform/ring0/offsets_arm64.go1
-rw-r--r--pkg/sentry/platform/ring0/x86.go2
-rw-r--r--pkg/sentry/socket/BUILD2
-rw-r--r--pkg/sentry/socket/hostinet/BUILD7
-rw-r--r--pkg/sentry/socket/hostinet/socket.go4
-rw-r--r--pkg/sentry/socket/hostinet/socket_vfs2.go13
-rw-r--r--pkg/sentry/socket/hostinet/stack.go30
-rw-r--r--pkg/sentry/socket/netfilter/extensions.go72
-rw-r--r--pkg/sentry/socket/netfilter/ipv4.go23
-rw-r--r--pkg/sentry/socket/netfilter/ipv6.go23
-rw-r--r--pkg/sentry/socket/netfilter/netfilter.go26
-rw-r--r--pkg/sentry/socket/netfilter/targets.go472
-rw-r--r--pkg/sentry/socket/netfilter/tcp_matcher.go32
-rw-r--r--pkg/sentry/socket/netfilter/udp_matcher.go32
-rw-r--r--pkg/sentry/socket/netlink/BUILD4
-rw-r--r--pkg/sentry/socket/netlink/provider_vfs2.go1
-rw-r--r--pkg/sentry/socket/netlink/socket.go4
-rw-r--r--pkg/sentry/socket/netlink/socket_vfs2.go7
-rw-r--r--pkg/sentry/socket/netstack/BUILD4
-rw-r--r--pkg/sentry/socket/netstack/netstack.go95
-rw-r--r--pkg/sentry/socket/netstack/netstack_vfs2.go27
-rw-r--r--pkg/sentry/socket/netstack/stack.go43
-rw-r--r--pkg/sentry/socket/socket.go2
-rw-r--r--pkg/sentry/socket/unix/BUILD18
-rw-r--r--pkg/sentry/socket/unix/transport/connectioned.go14
-rw-r--r--pkg/sentry/socket/unix/transport/connectionless.go4
-rw-r--r--pkg/sentry/socket/unix/transport/unix.go23
-rw-r--r--pkg/sentry/socket/unix/unix.go60
-rw-r--r--pkg/sentry/socket/unix/unix_vfs2.go47
-rw-r--r--pkg/sentry/state/state.go6
-rw-r--r--pkg/sentry/strace/BUILD1
-rw-r--r--pkg/sentry/strace/epoll.go4
-rw-r--r--pkg/sentry/strace/socket.go23
-rw-r--r--pkg/sentry/strace/strace.go37
-rw-r--r--pkg/sentry/syscalls/linux/BUILD5
-rw-r--r--pkg/sentry/syscalls/linux/linux64.go4
-rw-r--r--pkg/sentry/syscalls/linux/sys_aio.go33
-rw-r--r--pkg/sentry/syscalls/linux/sys_capability.go16
-rw-r--r--pkg/sentry/syscalls/linux/sys_file.go38
-rw-r--r--pkg/sentry/syscalls/linux/sys_futex.go14
-rw-r--r--pkg/sentry/syscalls/linux/sys_getdents.go31
-rw-r--r--pkg/sentry/syscalls/linux/sys_identity.go16
-rw-r--r--pkg/sentry/syscalls/linux/sys_mmap.go11
-rw-r--r--pkg/sentry/syscalls/linux/sys_pipe.go5
-rw-r--r--pkg/sentry/syscalls/linux/sys_poll.go12
-rw-r--r--pkg/sentry/syscalls/linux/sys_prctl.go5
-rw-r--r--pkg/sentry/syscalls/linux/sys_rlimit.go19
-rw-r--r--pkg/sentry/syscalls/linux/sys_rusage.go4
-rw-r--r--pkg/sentry/syscalls/linux/sys_sched.go8
-rw-r--r--pkg/sentry/syscalls/linux/sys_seccomp.go8
-rw-r--r--pkg/sentry/syscalls/linux/sys_sem.go11
-rw-r--r--pkg/sentry/syscalls/linux/sys_shm.go11
-rw-r--r--pkg/sentry/syscalls/linux/sys_socket.go67
-rw-r--r--pkg/sentry/syscalls/linux/sys_splice.go11
-rw-r--r--pkg/sentry/syscalls/linux/sys_stat.go4
-rw-r--r--pkg/sentry/syscalls/linux/sys_sysinfo.go2
-rw-r--r--pkg/sentry/syscalls/linux/sys_thread.go15
-rw-r--r--pkg/sentry/syscalls/linux/sys_time.go7
-rw-r--r--pkg/sentry/syscalls/linux/sys_timer.go101
-rw-r--r--pkg/sentry/syscalls/linux/sys_timerfd.go6
-rw-r--r--pkg/sentry/syscalls/linux/sys_tls_amd64.go13
-rw-r--r--pkg/sentry/syscalls/linux/sys_utsname.go2
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/BUILD5
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/aio.go27
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/epoll.go49
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/fd.go10
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/ioctl.go13
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/mmap.go12
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/pipe.go5
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/poll.go24
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/setstat.go4
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/socket.go67
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/splice.go106
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/timerfd.go6
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/vfs2.go16
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/xattr.go32
-rw-r--r--pkg/sentry/vfs/anonfs.go21
-rw-r--r--pkg/sentry/vfs/dentry.go2
-rw-r--r--pkg/sentry/vfs/device.go3
-rw-r--r--pkg/sentry/vfs/epoll.go25
-rw-r--r--pkg/sentry/vfs/file_description.go63
-rw-r--r--pkg/sentry/vfs/file_description_impl_util.go52
-rw-r--r--pkg/sentry/vfs/filesystem.go30
-rw-r--r--pkg/sentry/vfs/filesystem_type.go9
-rw-r--r--pkg/sentry/vfs/genericfstree/genericfstree.go2
-rw-r--r--pkg/sentry/vfs/inotify.go2
-rw-r--r--pkg/sentry/vfs/lock.go2
-rw-r--r--pkg/sentry/vfs/memxattr/xattr.go16
-rw-r--r--pkg/sentry/vfs/mount.go85
-rw-r--r--pkg/sentry/vfs/mount_test.go26
-rw-r--r--pkg/sentry/vfs/mount_unsafe.go11
-rw-r--r--pkg/sentry/vfs/options.go52
-rw-r--r--pkg/sentry/vfs/permissions.go40
-rw-r--r--pkg/sentry/vfs/resolving_path.go5
-rw-r--r--pkg/sentry/vfs/vfs.go66
317 files changed, 8606 insertions, 3530 deletions
diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD
index 901e0f320..4af4d6e84 100644
--- a/pkg/sentry/arch/BUILD
+++ b/pkg/sentry/arch/BUILD
@@ -22,6 +22,7 @@ go_library(
"signal_info.go",
"signal_stack.go",
"stack.go",
+ "stack_unsafe.go",
"syscalls_amd64.go",
"syscalls_arm64.go",
],
@@ -33,11 +34,12 @@ go_library(
"//pkg/context",
"//pkg/cpuid",
"//pkg/log",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
"//pkg/sentry/limits",
"//pkg/sync",
"//pkg/syserror",
"//pkg/usermem",
- "//tools/go_marshal/marshal",
],
)
diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go
index a903d031c..d75d665ae 100644
--- a/pkg/sentry/arch/arch.go
+++ b/pkg/sentry/arch/arch.go
@@ -23,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/cpuid"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -72,12 +73,12 @@ type Context interface {
// with return values of varying sizes (for example ARCH_GETFS). This
// is a simple utility function to convert to the native size in these
// cases, and then we can CopyOut.
- Native(val uintptr) interface{}
+ Native(val uintptr) marshal.Marshallable
// Value converts a native type back to a generic value.
// Once a value has been converted to native via the above call -- it
// can be converted back here.
- Value(val interface{}) uintptr
+ Value(val marshal.Marshallable) uintptr
// Width returns the number of bytes for a native value.
Width() uint
@@ -205,7 +206,7 @@ type Context interface {
// equivalent of arch_ptrace():
// PtracePeekUser implements ptrace(PTRACE_PEEKUSR).
- PtracePeekUser(addr uintptr) (interface{}, error)
+ PtracePeekUser(addr uintptr) (marshal.Marshallable, error)
// PtracePokeUser implements ptrace(PTRACE_POKEUSR).
PtracePokeUser(addr, data uintptr) error
diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go
index 1c3e3c14c..c7d3a206d 100644
--- a/pkg/sentry/arch/arch_amd64.go
+++ b/pkg/sentry/arch/arch_amd64.go
@@ -23,6 +23,8 @@ import (
"syscall"
"gvisor.dev/gvisor/pkg/cpuid"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -179,14 +181,14 @@ func (c *context64) SetOldRSeqInterruptedIP(value uintptr) {
}
// Native returns the native type for the given val.
-func (c *context64) Native(val uintptr) interface{} {
- v := uint64(val)
+func (c *context64) Native(val uintptr) marshal.Marshallable {
+ v := primitive.Uint64(val)
return &v
}
// Value returns the generic val for the given native type.
-func (c *context64) Value(val interface{}) uintptr {
- return uintptr(*val.(*uint64))
+func (c *context64) Value(val marshal.Marshallable) uintptr {
+ return uintptr(*val.(*primitive.Uint64))
}
// Width returns the byte width of this architecture.
@@ -293,7 +295,7 @@ func (c *context64) PIELoadAddress(l MmapLayout) usermem.Addr {
const userStructSize = 928
// PtracePeekUser implements Context.PtracePeekUser.
-func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) {
+func (c *context64) PtracePeekUser(addr uintptr) (marshal.Marshallable, error) {
if addr&7 != 0 || addr >= userStructSize {
return nil, syscall.EIO
}
diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go
index 550741d8c..680d23a9f 100644
--- a/pkg/sentry/arch/arch_arm64.go
+++ b/pkg/sentry/arch/arch_arm64.go
@@ -22,6 +22,8 @@ import (
"syscall"
"gvisor.dev/gvisor/pkg/cpuid"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -163,14 +165,14 @@ func (c *context64) SetOldRSeqInterruptedIP(value uintptr) {
}
// Native returns the native type for the given val.
-func (c *context64) Native(val uintptr) interface{} {
- v := uint64(val)
+func (c *context64) Native(val uintptr) marshal.Marshallable {
+ v := primitive.Uint64(val)
return &v
}
// Value returns the generic val for the given native type.
-func (c *context64) Value(val interface{}) uintptr {
- return uintptr(*val.(*uint64))
+func (c *context64) Value(val marshal.Marshallable) uintptr {
+ return uintptr(*val.(*primitive.Uint64))
}
// Width returns the byte width of this architecture.
@@ -274,7 +276,7 @@ func (c *context64) PIELoadAddress(l MmapLayout) usermem.Addr {
}
// PtracePeekUser implements Context.PtracePeekUser.
-func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) {
+func (c *context64) PtracePeekUser(addr uintptr) (marshal.Marshallable, error) {
// TODO(gvisor.dev/issue/1239): Full ptrace supporting for Arm64.
return c.Native(0), nil
}
diff --git a/pkg/sentry/arch/signal_act.go b/pkg/sentry/arch/signal_act.go
index 32173aa20..d3e2324a8 100644
--- a/pkg/sentry/arch/signal_act.go
+++ b/pkg/sentry/arch/signal_act.go
@@ -14,7 +14,7 @@
package arch
-import "gvisor.dev/gvisor/tools/go_marshal/marshal"
+import "gvisor.dev/gvisor/pkg/marshal"
// Special values for SignalAct.Handler.
const (
diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go
index 6fb756f0e..72e07a988 100644
--- a/pkg/sentry/arch/signal_amd64.go
+++ b/pkg/sentry/arch/signal_amd64.go
@@ -17,17 +17,19 @@
package arch
import (
- "encoding/binary"
"math"
"syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/usermem"
)
// SignalContext64 is equivalent to struct sigcontext, the type passed as the
// second argument to signal handlers set by signal(2).
+//
+// +marshal
type SignalContext64 struct {
R8 uint64
R9 uint64
@@ -68,6 +70,8 @@ const (
)
// UContext64 is equivalent to ucontext_t on 64-bit x86.
+//
+// +marshal
type UContext64 struct {
Flags uint64
Link uint64
@@ -172,12 +176,7 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt
// "... the value (%rsp+8) is always a multiple of 16 (...) when
// control is transferred to the function entry point." - AMD64 ABI
- ucSize := binary.Size(uc)
- if ucSize < 0 {
- // This can only happen if we've screwed up the definition of
- // UContext64.
- panic("can't get size of UContext64")
- }
+ ucSize := uc.SizeBytes()
// st.Arch.Width() is for the restorer address. sizeof(siginfo) == 128.
frameSize := int(st.Arch.Width()) + ucSize + 128
frameBottom := (sp-usermem.Addr(frameSize)) & ^usermem.Addr(15) - 8
@@ -195,18 +194,18 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt
info.FixSignalCodeForUser()
// Set up the stack frame.
- infoAddr, err := st.Push(info)
- if err != nil {
+ if _, err := info.CopyOut(st, StackBottomMagic); err != nil {
return err
}
- ucAddr, err := st.Push(uc)
- if err != nil {
+ infoAddr := st.Bottom
+ if _, err := uc.CopyOut(st, StackBottomMagic); err != nil {
return err
}
+ ucAddr := st.Bottom
if act.HasRestorer() {
// Push the restorer return address.
// Note that this doesn't need to be popped.
- if _, err := st.Push(usermem.Addr(act.Restorer)); err != nil {
+ if _, err := primitive.CopyUint64Out(st, StackBottomMagic, act.Restorer); err != nil {
return err
}
} else {
@@ -240,11 +239,11 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt
func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalStack, error) {
// Copy out the stack frame.
var uc UContext64
- if _, err := st.Pop(&uc); err != nil {
+ if _, err := uc.CopyIn(st, StackBottomMagic); err != nil {
return 0, SignalStack{}, err
}
var info SignalInfo
- if _, err := st.Pop(&info); err != nil {
+ if _, err := info.CopyIn(st, StackBottomMagic); err != nil {
return 0, SignalStack{}, err
}
diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go
index 642c79dda..7fde5d34e 100644
--- a/pkg/sentry/arch/signal_arm64.go
+++ b/pkg/sentry/arch/signal_arm64.go
@@ -12,10 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+// +build arm64
+
package arch
import (
- "encoding/binary"
"syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -25,6 +26,8 @@ import (
// SignalContext64 is equivalent to struct sigcontext, the type passed as the
// second argument to signal handlers set by signal(2).
+//
+// +marshal
type SignalContext64 struct {
FaultAddr uint64
Regs [31]uint64
@@ -36,6 +39,7 @@ type SignalContext64 struct {
Reserved [3568]uint8
}
+// +marshal
type aarch64Ctx struct {
Magic uint32
Size uint32
@@ -43,6 +47,8 @@ type aarch64Ctx struct {
// FpsimdContext is equivalent to struct fpsimd_context on arm64
// (arch/arm64/include/uapi/asm/sigcontext.h).
+//
+// +marshal
type FpsimdContext struct {
Head aarch64Ctx
Fpsr uint32
@@ -51,13 +57,15 @@ type FpsimdContext struct {
}
// UContext64 is equivalent to ucontext on arm64(arch/arm64/include/uapi/asm/ucontext.h).
+//
+// +marshal
type UContext64 struct {
Flags uint64
Link uint64
Stack SignalStack
Sigset linux.SignalSet
// glibc uses a 1024-bit sigset_t
- _pad [(1024 - 64) / 8]byte
+ _pad [120]byte // (1024 - 64) / 8 = 120
// sigcontext must be aligned to 16-byte
_pad2 [8]byte
// last for future expansion
@@ -94,11 +102,7 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt
},
Sigset: sigset,
}
-
- ucSize := binary.Size(uc)
- if ucSize < 0 {
- panic("can't get size of UContext64")
- }
+ ucSize := uc.SizeBytes()
// frameSize = ucSize + sizeof(siginfo).
// sizeof(siginfo) == 128.
@@ -119,14 +123,14 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt
info.FixSignalCodeForUser()
// Set up the stack frame.
- infoAddr, err := st.Push(info)
- if err != nil {
+ if _, err := info.CopyOut(st, StackBottomMagic); err != nil {
return err
}
- ucAddr, err := st.Push(uc)
- if err != nil {
+ infoAddr := st.Bottom
+ if _, err := uc.CopyOut(st, StackBottomMagic); err != nil {
return err
}
+ ucAddr := st.Bottom
// Set up registers.
c.Regs.Sp = uint64(st.Bottom)
@@ -147,11 +151,11 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt
func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalStack, error) {
// Copy out the stack frame.
var uc UContext64
- if _, err := st.Pop(&uc); err != nil {
+ if _, err := uc.CopyIn(st, StackBottomMagic); err != nil {
return 0, SignalStack{}, err
}
var info SignalInfo
- if _, err := st.Pop(&info); err != nil {
+ if _, err := info.CopyIn(st, StackBottomMagic); err != nil {
return 0, SignalStack{}, err
}
diff --git a/pkg/sentry/arch/signal_stack.go b/pkg/sentry/arch/signal_stack.go
index 0fa738a1d..a1eae98f9 100644
--- a/pkg/sentry/arch/signal_stack.go
+++ b/pkg/sentry/arch/signal_stack.go
@@ -17,8 +17,8 @@
package arch
import (
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/usermem"
- "gvisor.dev/gvisor/tools/go_marshal/marshal"
)
const (
diff --git a/pkg/sentry/arch/stack.go b/pkg/sentry/arch/stack.go
index 1108fa0bd..5f06c751d 100644
--- a/pkg/sentry/arch/stack.go
+++ b/pkg/sentry/arch/stack.go
@@ -15,14 +15,16 @@
package arch
import (
- "encoding/binary"
- "fmt"
-
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/usermem"
)
-// Stack is a simple wrapper around a usermem.IO and an address.
+// Stack is a simple wrapper around a usermem.IO and an address. Stack
+// implements marshal.CopyContext, and marshallable values can be pushed or
+// popped from the stack through the marshal.Marshallable interface.
+//
+// Stack is not thread-safe.
type Stack struct {
// Our arch info.
// We use this for automatic Native conversion of usermem.Addrs during
@@ -34,105 +36,60 @@ type Stack struct {
// Our current stack bottom.
Bottom usermem.Addr
-}
-// Push pushes the given values on to the stack.
-//
-// (This method supports Addrs and treats them as native types.)
-func (s *Stack) Push(vals ...interface{}) (usermem.Addr, error) {
- for _, v := range vals {
-
- // We convert some types to well-known serializable quanities.
- var norm interface{}
-
- // For array types, we will automatically add an appropriate
- // terminal value. This is done simply to make the interface
- // easier to use.
- var term interface{}
-
- switch v.(type) {
- case string:
- norm = []byte(v.(string))
- term = byte(0)
- case []int8, []uint8:
- norm = v
- term = byte(0)
- case []int16, []uint16:
- norm = v
- term = uint16(0)
- case []int32, []uint32:
- norm = v
- term = uint32(0)
- case []int64, []uint64:
- norm = v
- term = uint64(0)
- case []usermem.Addr:
- // Special case: simply push recursively.
- _, err := s.Push(s.Arch.Native(uintptr(0)))
- if err != nil {
- return 0, err
- }
- varr := v.([]usermem.Addr)
- for i := len(varr) - 1; i >= 0; i-- {
- _, err := s.Push(varr[i])
- if err != nil {
- return 0, err
- }
- }
- continue
- case usermem.Addr:
- norm = s.Arch.Native(uintptr(v.(usermem.Addr)))
- default:
- norm = v
- }
+ // Scratch buffer used for marshalling to avoid having to repeatedly
+ // allocate scratch memory.
+ scratchBuf []byte
+}
- if term != nil {
- _, err := s.Push(term)
- if err != nil {
- return 0, err
- }
- }
+// scratchBufLen is the default length of Stack.scratchBuf. The
+// largest structs the stack regularly serializes are arch.SignalInfo
+// and arch.UContext64. We'll set the default size as the larger of
+// the two, arch.UContext64.
+var scratchBufLen = (*UContext64)(nil).SizeBytes()
- c := binary.Size(norm)
- if c < 0 {
- return 0, fmt.Errorf("bad binary.Size for %T", v)
- }
- n, err := usermem.CopyObjectOut(context.Background(), s.IO, s.Bottom-usermem.Addr(c), norm, usermem.IOOpts{})
- if err != nil || c != n {
- return 0, err
- }
+// CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer.
+func (s *Stack) CopyScratchBuffer(size int) []byte {
+ if len(s.scratchBuf) < size {
+ s.scratchBuf = make([]byte, size)
+ }
+ return s.scratchBuf[:size]
+}
+// StackBottomMagic is the special address callers must past to all stack
+// marshalling operations to cause the src/dst address to be computed based on
+// the current end of the stack.
+const StackBottomMagic = ^usermem.Addr(0) // usermem.Addr(-1)
+
+// CopyOutBytes implements marshal.CopyContext.CopyOutBytes. CopyOutBytes
+// computes an appropriate address based on the current end of the
+// stack. Callers use the sentinel address StackBottomMagic to marshal methods
+// to indicate this.
+func (s *Stack) CopyOutBytes(sentinel usermem.Addr, b []byte) (int, error) {
+ if sentinel != StackBottomMagic {
+ panic("Attempted to copy out to stack with absolute address")
+ }
+ c := len(b)
+ n, err := s.IO.CopyOut(context.Background(), s.Bottom-usermem.Addr(c), b, usermem.IOOpts{})
+ if err == nil && n == c {
s.Bottom -= usermem.Addr(n)
}
-
- return s.Bottom, nil
+ return n, err
}
-// Pop pops the given values off the stack.
-//
-// (This method supports Addrs and treats them as native types.)
-func (s *Stack) Pop(vals ...interface{}) (usermem.Addr, error) {
- for _, v := range vals {
-
- vaddr, isVaddr := v.(*usermem.Addr)
-
- var n int
- var err error
- if isVaddr {
- value := s.Arch.Native(uintptr(0))
- n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, value, usermem.IOOpts{})
- *vaddr = usermem.Addr(s.Arch.Value(value))
- } else {
- n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, v, usermem.IOOpts{})
- }
- if err != nil {
- return 0, err
- }
-
+// CopyInBytes implements marshal.CopyContext.CopyInBytes. CopyInBytes computes
+// an appropriate address based on the current end of the stack. Callers must
+// use the sentinel address StackBottomMagic to marshal methods to indicate
+// this.
+func (s *Stack) CopyInBytes(sentinel usermem.Addr, b []byte) (int, error) {
+ if sentinel != StackBottomMagic {
+ panic("Attempted to copy in from stack with absolute address")
+ }
+ n, err := s.IO.CopyIn(context.Background(), s.Bottom, b, usermem.IOOpts{})
+ if err == nil {
s.Bottom += usermem.Addr(n)
}
-
- return s.Bottom, nil
+ return n, err
}
// Align aligns the stack to the given offset.
@@ -142,6 +99,22 @@ func (s *Stack) Align(offset int) {
}
}
+// PushNullTerminatedByteSlice writes bs to the stack, followed by an extra null
+// byte at the end. On error, the contents of the stack and the bottom cursor
+// are undefined.
+func (s *Stack) PushNullTerminatedByteSlice(bs []byte) (int, error) {
+ // Note: Stack grows up, so write the terminal null byte first.
+ nNull, err := primitive.CopyUint8Out(s, StackBottomMagic, 0)
+ if err != nil {
+ return 0, err
+ }
+ n, err := primitive.CopyByteSliceOut(s, StackBottomMagic, bs)
+ if err != nil {
+ return 0, err
+ }
+ return n + nNull, nil
+}
+
// StackLayout describes the location of the arguments and environment on the
// stack.
type StackLayout struct {
@@ -177,11 +150,10 @@ func (s *Stack) Load(args []string, env []string, aux Auxv) (StackLayout, error)
l.EnvvEnd = s.Bottom
envAddrs := make([]usermem.Addr, len(env))
for i := len(env) - 1; i >= 0; i-- {
- addr, err := s.Push(env[i])
- if err != nil {
+ if _, err := s.PushNullTerminatedByteSlice([]byte(env[i])); err != nil {
return StackLayout{}, err
}
- envAddrs[i] = addr
+ envAddrs[i] = s.Bottom
}
l.EnvvStart = s.Bottom
@@ -189,11 +161,10 @@ func (s *Stack) Load(args []string, env []string, aux Auxv) (StackLayout, error)
l.ArgvEnd = s.Bottom
argAddrs := make([]usermem.Addr, len(args))
for i := len(args) - 1; i >= 0; i-- {
- addr, err := s.Push(args[i])
- if err != nil {
+ if _, err := s.PushNullTerminatedByteSlice([]byte(args[i])); err != nil {
return StackLayout{}, err
}
- argAddrs[i] = addr
+ argAddrs[i] = s.Bottom
}
l.ArgvStart = s.Bottom
@@ -222,26 +193,26 @@ func (s *Stack) Load(args []string, env []string, aux Auxv) (StackLayout, error)
auxv = append(auxv, usermem.Addr(a.Key), a.Value)
}
auxv = append(auxv, usermem.Addr(0))
- _, err := s.Push(auxv)
+ _, err := s.pushAddrSliceAndTerminator(auxv)
if err != nil {
return StackLayout{}, err
}
// Push environment.
- _, err = s.Push(envAddrs)
+ _, err = s.pushAddrSliceAndTerminator(envAddrs)
if err != nil {
return StackLayout{}, err
}
// Push args.
- _, err = s.Push(argAddrs)
+ _, err = s.pushAddrSliceAndTerminator(argAddrs)
if err != nil {
return StackLayout{}, err
}
// Push arg count.
- _, err = s.Push(usermem.Addr(len(args)))
- if err != nil {
+ lenP := s.Arch.Native(uintptr(len(args)))
+ if _, err = lenP.CopyOut(s, StackBottomMagic); err != nil {
return StackLayout{}, err
}
diff --git a/pkg/sentry/arch/stack_unsafe.go b/pkg/sentry/arch/stack_unsafe.go
new file mode 100644
index 000000000..a90d297ee
--- /dev/null
+++ b/pkg/sentry/arch/stack_unsafe.go
@@ -0,0 +1,69 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arch
+
+import (
+ "reflect"
+ "runtime"
+ "unsafe"
+
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// pushAddrSliceAndTerminator copies a slices of addresses to the stack, and
+// also pushes an extra null address element at the end of the slice.
+//
+// Internally, we unsafely transmute the slice type from the arch-dependent
+// []usermem.Addr type, to a slice of fixed-sized ints so that we can pass it to
+// go-marshal.
+//
+// On error, the contents of the stack and the bottom cursor are undefined.
+func (s *Stack) pushAddrSliceAndTerminator(src []usermem.Addr) (int, error) {
+ // Note: Stack grows upwards, so push the terminator first.
+ srcHdr := (*reflect.SliceHeader)(unsafe.Pointer(&src))
+ switch s.Arch.Width() {
+ case 8:
+ nNull, err := primitive.CopyUint64Out(s, StackBottomMagic, 0)
+ if err != nil {
+ return 0, err
+ }
+ var dst []uint64
+ dstHdr := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
+ dstHdr.Data = srcHdr.Data
+ dstHdr.Len = srcHdr.Len
+ dstHdr.Cap = srcHdr.Cap
+ n, err := primitive.CopyUint64SliceOut(s, StackBottomMagic, dst)
+ // Ensures src doesn't get GCed until we're done using it through dst.
+ runtime.KeepAlive(src)
+ return n + nNull, err
+ case 4:
+ nNull, err := primitive.CopyUint32Out(s, StackBottomMagic, 0)
+ if err != nil {
+ return 0, err
+ }
+ var dst []uint32
+ dstHdr := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
+ dstHdr.Data = srcHdr.Data
+ dstHdr.Len = srcHdr.Len
+ dstHdr.Cap = srcHdr.Cap
+ n, err := primitive.CopyUint32SliceOut(s, StackBottomMagic, dst)
+ // Ensure src doesn't get GCed until we're done using it through dst.
+ runtime.KeepAlive(src)
+ return n + nNull, err
+ default:
+ panic("Unsupported arch width")
+ }
+}
diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD
index 2c5d14be5..deaf5fa23 100644
--- a/pkg/sentry/control/BUILD
+++ b/pkg/sentry/control/BUILD
@@ -35,7 +35,6 @@ go_library(
"//pkg/sync",
"//pkg/tcpip/link/sniffer",
"//pkg/urpc",
- "@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index dfa936563..668f47802 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -23,8 +23,8 @@ import (
"text/tabwriter"
"time"
- "golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/sentry/fdimport"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/host"
@@ -203,27 +203,17 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
}
initArgs.Filename = resolved
- fds := make([]int, len(args.FilePayload.Files))
- for i, file := range args.FilePayload.Files {
- if kernel.VFS2Enabled {
- // Need to dup to remove ownership from os.File.
- dup, err := unix.Dup(int(file.Fd()))
- if err != nil {
- return nil, 0, nil, nil, fmt.Errorf("duplicating payload files: %w", err)
- }
- fds[i] = dup
- } else {
- // VFS1 dups the file on import.
- fds[i] = int(file.Fd())
- }
+ fds, err := fd.NewFromFiles(args.Files)
+ if err != nil {
+ return nil, 0, nil, nil, fmt.Errorf("duplicating payload files: %w", err)
}
+ defer func() {
+ for _, fd := range fds {
+ _ = fd.Close()
+ }
+ }()
ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, args.StdioIsPty, fds)
if err != nil {
- if kernel.VFS2Enabled {
- for _, fd := range fds {
- unix.Close(fd)
- }
- }
return nil, 0, nil, nil, err
}
diff --git a/pkg/sentry/devices/memdev/BUILD b/pkg/sentry/devices/memdev/BUILD
index abe58f818..4c8604d58 100644
--- a/pkg/sentry/devices/memdev/BUILD
+++ b/pkg/sentry/devices/memdev/BUILD
@@ -18,9 +18,10 @@ go_library(
"//pkg/rand",
"//pkg/safemem",
"//pkg/sentry/fsimpl/devtmpfs",
+ "//pkg/sentry/fsimpl/tmpfs",
+ "//pkg/sentry/kernel",
+ "//pkg/sentry/kernel/auth",
"//pkg/sentry/memmap",
- "//pkg/sentry/mm",
- "//pkg/sentry/pgalloc",
"//pkg/sentry/vfs",
"//pkg/syserror",
"//pkg/usermem",
diff --git a/pkg/sentry/devices/memdev/full.go b/pkg/sentry/devices/memdev/full.go
index 511179e31..fece3e762 100644
--- a/pkg/sentry/devices/memdev/full.go
+++ b/pkg/sentry/devices/memdev/full.go
@@ -24,6 +24,8 @@ import (
const fullDevMinor = 7
// fullDevice implements vfs.Device for /dev/full.
+//
+// +stateify savable
type fullDevice struct{}
// Open implements vfs.Device.Open.
@@ -38,6 +40,8 @@ func (fullDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, op
}
// fullFD implements vfs.FileDescriptionImpl for /dev/full.
+//
+// +stateify savable
type fullFD struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
diff --git a/pkg/sentry/devices/memdev/null.go b/pkg/sentry/devices/memdev/null.go
index 4918dbeeb..ff5837747 100644
--- a/pkg/sentry/devices/memdev/null.go
+++ b/pkg/sentry/devices/memdev/null.go
@@ -25,6 +25,8 @@ import (
const nullDevMinor = 3
// nullDevice implements vfs.Device for /dev/null.
+//
+// +stateify savable
type nullDevice struct{}
// Open implements vfs.Device.Open.
@@ -39,6 +41,8 @@ func (nullDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, op
}
// nullFD implements vfs.FileDescriptionImpl for /dev/null.
+//
+// +stateify savable
type nullFD struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
diff --git a/pkg/sentry/devices/memdev/random.go b/pkg/sentry/devices/memdev/random.go
index 5e7fe0280..ac943e3ba 100644
--- a/pkg/sentry/devices/memdev/random.go
+++ b/pkg/sentry/devices/memdev/random.go
@@ -30,6 +30,8 @@ const (
)
// randomDevice implements vfs.Device for /dev/random and /dev/urandom.
+//
+// +stateify savable
type randomDevice struct{}
// Open implements vfs.Device.Open.
@@ -44,6 +46,8 @@ func (randomDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry,
}
// randomFD implements vfs.FileDescriptionImpl for /dev/random.
+//
+// +stateify savable
type randomFD struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
diff --git a/pkg/sentry/devices/memdev/zero.go b/pkg/sentry/devices/memdev/zero.go
index 2e631a252..1929e41cd 100644
--- a/pkg/sentry/devices/memdev/zero.go
+++ b/pkg/sentry/devices/memdev/zero.go
@@ -16,9 +16,10 @@ package memdev
import (
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/mm"
- "gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -26,6 +27,8 @@ import (
const zeroDevMinor = 5
// zeroDevice implements vfs.Device for /dev/zero.
+//
+// +stateify savable
type zeroDevice struct{}
// Open implements vfs.Device.Open.
@@ -40,6 +43,8 @@ func (zeroDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, op
}
// zeroFD implements vfs.FileDescriptionImpl for /dev/zero.
+//
+// +stateify savable
type zeroFD struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
@@ -79,11 +84,22 @@ func (fd *zeroFD) Seek(ctx context.Context, offset int64, whence int32) (int64,
// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
func (fd *zeroFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
- m, err := mm.NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx))
+ if opts.Private || !opts.MaxPerms.Write {
+ // This mapping will never permit writing to the "underlying file" (in
+ // Linux terms, it isn't VM_SHARED), so implement it as an anonymous
+ // mapping, but back it with fd; this is what Linux does, and is
+ // actually application-visible because the resulting VMA will show up
+ // in /proc/[pid]/maps with fd.vfsfd.VirtualDentry()'s path rather than
+ // "/dev/zero (deleted)".
+ opts.Offset = 0
+ opts.MappingIdentity = &fd.vfsfd
+ opts.MappingIdentity.IncRef()
+ return nil
+ }
+ tmpfsFD, err := tmpfs.NewZeroFile(ctx, auth.CredentialsFromContext(ctx), kernel.KernelFromContext(ctx).ShmMount(), opts.Length)
if err != nil {
return err
}
- opts.MappingIdentity = m
- opts.Mappable = m
- return nil
+ defer tmpfsFD.DecRef(ctx)
+ return tmpfsFD.ConfigureMMap(ctx, opts)
}
diff --git a/pkg/sentry/devices/ttydev/ttydev.go b/pkg/sentry/devices/ttydev/ttydev.go
index 664e54498..a287c65ca 100644
--- a/pkg/sentry/devices/ttydev/ttydev.go
+++ b/pkg/sentry/devices/ttydev/ttydev.go
@@ -30,6 +30,8 @@ const (
)
// ttyDevice implements vfs.Device for /dev/tty.
+//
+// +stateify savable
type ttyDevice struct{}
// Open implements vfs.Device.Open.
diff --git a/pkg/sentry/fdimport/BUILD b/pkg/sentry/fdimport/BUILD
index 5e41ceb4e..6b4f8b0ed 100644
--- a/pkg/sentry/fdimport/BUILD
+++ b/pkg/sentry/fdimport/BUILD
@@ -10,6 +10,7 @@ go_library(
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/context",
+ "//pkg/fd",
"//pkg/sentry/fs",
"//pkg/sentry/fs/host",
"//pkg/sentry/fsimpl/host",
diff --git a/pkg/sentry/fdimport/fdimport.go b/pkg/sentry/fdimport/fdimport.go
index 1b7cb94c0..314661475 100644
--- a/pkg/sentry/fdimport/fdimport.go
+++ b/pkg/sentry/fdimport/fdimport.go
@@ -18,6 +18,7 @@ import (
"fmt"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/host"
hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
@@ -27,8 +28,9 @@ import (
// Import imports a slice of FDs into the given FDTable. If console is true,
// sets up TTY for the first 3 FDs in the slice representing stdin, stdout,
-// stderr. Upon success, Import takes ownership of all FDs.
-func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []int) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+// stderr. Used FDs are either closed or released. It's safe for the caller to
+// close any remaining files upon return.
+func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []*fd.FD) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
if kernel.VFS2Enabled {
ttyFile, err := importVFS2(ctx, fdTable, console, fds)
return nil, ttyFile, err
@@ -37,7 +39,7 @@ func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []in
return ttyFile, nil, err
}
-func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []int) (*host.TTYFileOperations, error) {
+func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []*fd.FD) (*host.TTYFileOperations, error) {
var ttyFile *fs.File
for appFD, hostFD := range fds {
var appFile *fs.File
@@ -46,11 +48,12 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []
// Import the file as a host TTY file.
if ttyFile == nil {
var err error
- appFile, err = host.ImportFile(ctx, hostFD, true /* isTTY */)
+ appFile, err = host.ImportFile(ctx, hostFD.FD(), true /* isTTY */)
if err != nil {
return nil, err
}
defer appFile.DecRef(ctx)
+ _ = hostFD.Close() // FD is dup'd i ImportFile.
// Remember this in the TTY file, as we will
// use it for the other stdio FDs.
@@ -65,11 +68,12 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []
} else {
// Import the file as a regular host file.
var err error
- appFile, err = host.ImportFile(ctx, hostFD, false /* isTTY */)
+ appFile, err = host.ImportFile(ctx, hostFD.FD(), false /* isTTY */)
if err != nil {
return nil, err
}
defer appFile.DecRef(ctx)
+ _ = hostFD.Close() // FD is dup'd i ImportFile.
}
// Add the file to the FD map.
@@ -84,7 +88,7 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []
return ttyFile.FileOperations.(*host.TTYFileOperations), nil
}
-func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdioFDs []int) (*hostvfs2.TTYFileDescription, error) {
+func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdioFDs []*fd.FD) (*hostvfs2.TTYFileDescription, error) {
k := kernel.KernelFromContext(ctx)
if k == nil {
return nil, fmt.Errorf("cannot find kernel from context")
@@ -98,11 +102,12 @@ func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdi
// Import the file as a host TTY file.
if ttyFile == nil {
var err error
- appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD, true /* isTTY */)
+ appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD.FD(), true /* isTTY */)
if err != nil {
return nil, err
}
defer appFile.DecRef(ctx)
+ hostFD.Release() // FD is transfered to host FD.
// Remember this in the TTY file, as we will use it for the other stdio
// FDs.
@@ -115,11 +120,12 @@ func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdi
}
} else {
var err error
- appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD, false /* isTTY */)
+ appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD.FD(), false /* isTTY */)
if err != nil {
return nil, err
}
defer appFile.DecRef(ctx)
+ hostFD.Release() // FD is transfered to host FD.
}
if err := fdTable.NewFDAtVFS2(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil {
diff --git a/pkg/sentry/fs/g3doc/fuse.md b/pkg/sentry/fs/g3doc/fuse.md
index 2ca84dd74..05e043583 100644
--- a/pkg/sentry/fs/g3doc/fuse.md
+++ b/pkg/sentry/fs/g3doc/fuse.md
@@ -79,7 +79,7 @@ ops can be implemented in parallel.
- Implement `/dev/fuse` - a character device used to establish an FD for
communication between the sentry and the server daemon.
-- Implement basic FUSE ops like `FUSE_INIT`, `FUSE_DESTROY`.
+- Implement basic FUSE ops like `FUSE_INIT`.
#### Read-only mount with basic file operations
@@ -95,6 +95,103 @@ ops can be implemented in parallel.
- Implement the remaining FUSE ops and decide if we can omit rarely used
operations like ioctl.
+### Design Details
+
+#### Lifecycle for a FUSE Request
+
+- User invokes a syscall
+- Sentry prepares corresponding request
+ - If FUSE device is available
+ - Write the request in binary
+ - If FUSE device is full
+ - Kernel task blocked until available
+- Sentry notifies the readers of fuse device that it's ready for read
+- FUSE daemon reads the request and processes it
+- Sentry waits until a reply is written to the FUSE device
+ - but returns directly for async requests
+- FUSE daemon writes to the fuse device
+- Sentry processes the reply
+ - For sync requests, unblock blocked kernel task
+ - For async requests, execute pre-specified callback if any
+- Sentry returns the syscall to the user
+
+#### Channels and Queues for Requests in Different Stages
+
+`connection.initializedChan`
+
+- a channel that the requests issued before connection initialization blocks
+ on.
+
+`fd.queue`
+
+- a queue of requests that haven’t been read by the FUSE daemon yet.
+
+`fd.completions`
+
+- a map of the requests that have been prepared but not yet received a
+ response, including the ones on the `fd.queue`.
+
+`fd.waitQueue`
+
+- a queue of waiters that is waiting for the fuse device fd to be available,
+ such as the FUSE daemon.
+
+`fd.fullQueueCh`
+
+- a channel that the kernel task will be blocked on when the fd is not
+ available.
+
+#### Basic I/O Implementation
+
+Currently we have implemented basic functionalities of read and write for our
+FUSE. We describe the design and ways to improve it here:
+
+##### Basic FUSE Read
+
+The vfs2 expects implementations of `vfs.FileDescriptionImpl.Read()` and
+`vfs.FileDescriptionImpl.PRead()`. When a syscall is made, it will eventually
+reach our implementation of those interface functions located at
+`pkg/sentry/fsimpl/fuse/regular_file.go` for regular files.
+
+After validation checks of the input, sentry sends `FUSE_READ` requests to the
+FUSE daemon. The FUSE daemon returns data after the `fuse_out_header` as the
+responses. For the first version, we create a copy in kernel memory of those
+data. They are represented as a byte slice in the marshalled struct. This
+happens as a common process for all the FUSE responses at this moment at
+`pkg/sentry/fsimpl/fuse/dev.go:writeLocked()`. We then directly copy from this
+intermediate buffer to the input buffer provided by the read syscall.
+
+There is an extra requirement for FUSE: When mounting the FUSE fs, the mounter
+or the FUSE daemon can specify a `max_read` or a `max_pages` parameter. They are
+the upperbound of the bytes to read in each `FUSE_READ` request. We implemented
+the code to handle the fragmented reads.
+
+To improve the performance: ideally we should have buffer cache to copy those
+data from the responses of FUSE daemon into, as is also the design of several
+other existing file system implementations for sentry, instead of a single-use
+temporary buffer. Directly mapping the memory of one process to another could
+also boost the performance, but to keep them isolated, we did not choose to do
+so.
+
+##### Basic FUSE Write
+
+The vfs2 invokes implementations of `vfs.FileDescriptionImpl.Write()` and
+`vfs.FileDescriptionImpl.PWrite()` on the regular file descriptor of FUSE when a
+user makes write(2) and pwrite(2) syscall.
+
+For valid writes, sentry sends the bytes to write after a `FUSE_WRITE` header
+(can be regarded as a request with 2 payloads) to the FUSE daemon. For the first
+version, we allocate a buffer inside kernel memory to store the bytes from the
+user, and copy directly from that buffer to the memory of FUSE daemon. This
+happens at `pkg/sentry/fsimpl/fuse/dev.go:readLocked()`
+
+The parameters `max_write` and `max_pages` restrict the number of bytes in one
+`FUSE_WRITE`. There are code handling fragmented writes in current
+implementation.
+
+To have better performance: the extra copy created to store the bytes to write
+can be replaced by the buffer cache as well.
+
# Appendix
## FUSE Protocol
diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD
index 42a6c41c2..1368014c4 100644
--- a/pkg/sentry/fs/host/BUILD
+++ b/pkg/sentry/fs/host/BUILD
@@ -32,6 +32,7 @@ go_library(
"//pkg/fdnotifier",
"//pkg/iovec",
"//pkg/log",
+ "//pkg/marshal/primitive",
"//pkg/refs",
"//pkg/safemem",
"//pkg/secio",
@@ -55,7 +56,6 @@ go_library(
"//pkg/unet",
"//pkg/usermem",
"//pkg/waiter",
- "//tools/go_marshal/primitive",
],
)
diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go
index 5d4f312cf..c8231e0aa 100644
--- a/pkg/sentry/fs/host/socket_unsafe.go
+++ b/pkg/sentry/fs/host/socket_unsafe.go
@@ -65,10 +65,10 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) (
controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC
if n > length {
- return length, n, msg.Controllen, controlTrunc, err
+ return length, n, msg.Controllen, controlTrunc, nil
}
- return n, n, msg.Controllen, controlTrunc, err
+ return n, n, msg.Controllen, controlTrunc, nil
}
// fdWriteVec sends from bufs to fd.
diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go
index 67a807f9d..1183727ab 100644
--- a/pkg/sentry/fs/host/tty.go
+++ b/pkg/sentry/fs/host/tty.go
@@ -17,6 +17,7 @@ package host
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -24,7 +25,6 @@ import (
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
- "gvisor.dev/gvisor/tools/go_marshal/primitive"
)
// LINT.IfChange
@@ -54,7 +54,7 @@ type TTYFileOperations struct {
func newTTYFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations) *fs.File {
return fs.NewFile(ctx, dirent, flags, &TTYFileOperations{
fileOperations: fileOperations{iops: iops},
- termios: linux.DefaultSlaveTermios,
+ termios: linux.DefaultReplicaTermios,
})
}
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index b79cd9877..004910453 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -270,7 +270,7 @@ func (i *Inode) GetXattr(ctx context.Context, name string, size uint64) (string,
// SetXattr calls i.InodeOperations.SetXattr with i as the Inode.
func (i *Inode) SetXattr(ctx context.Context, d *Dirent, name, value string, flags uint32) error {
if i.overlay != nil {
- return overlaySetxattr(ctx, i.overlay, d, name, value, flags)
+ return overlaySetXattr(ctx, i.overlay, d, name, value, flags)
}
return i.InodeOperations.SetXattr(ctx, i, name, value, flags)
}
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index dc2e353d9..b16ab08ba 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -16,7 +16,6 @@ package fs
import (
"fmt"
- "strings"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
@@ -539,7 +538,7 @@ func overlayGetXattr(ctx context.Context, o *overlayEntry, name string, size uin
// Don't forward the value of the extended attribute if it would
// unexpectedly change the behavior of a wrapping overlay layer.
- if strings.HasPrefix(XattrOverlayPrefix, name) {
+ if isXattrOverlay(name) {
return "", syserror.ENODATA
}
@@ -553,9 +552,9 @@ func overlayGetXattr(ctx context.Context, o *overlayEntry, name string, size uin
return s, err
}
-func overlaySetxattr(ctx context.Context, o *overlayEntry, d *Dirent, name, value string, flags uint32) error {
+func overlaySetXattr(ctx context.Context, o *overlayEntry, d *Dirent, name, value string, flags uint32) error {
// Don't allow changes to overlay xattrs through a setxattr syscall.
- if strings.HasPrefix(XattrOverlayPrefix, name) {
+ if isXattrOverlay(name) {
return syserror.EPERM
}
@@ -578,7 +577,7 @@ func overlayListXattr(ctx context.Context, o *overlayEntry, size uint64) (map[st
for name := range names {
// Same as overlayGetXattr, we shouldn't forward along
// overlay attributes.
- if strings.HasPrefix(XattrOverlayPrefix, name) {
+ if isXattrOverlay(name) {
delete(names, name)
}
}
@@ -587,7 +586,7 @@ func overlayListXattr(ctx context.Context, o *overlayEntry, size uint64) (map[st
func overlayRemoveXattr(ctx context.Context, o *overlayEntry, d *Dirent, name string) error {
// Don't allow changes to overlay xattrs through a removexattr syscall.
- if strings.HasPrefix(XattrOverlayPrefix, name) {
+ if isXattrOverlay(name) {
return syserror.EPERM
}
diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD
index 77c2c5c0e..b8b2281a8 100644
--- a/pkg/sentry/fs/proc/BUILD
+++ b/pkg/sentry/fs/proc/BUILD
@@ -50,6 +50,7 @@ go_library(
"//pkg/sync",
"//pkg/syserror",
"//pkg/tcpip/header",
+ "//pkg/tcpip/network/ipv4",
"//pkg/usermem",
"//pkg/waiter",
],
diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go
index 8615b60f0..e555672ad 100644
--- a/pkg/sentry/fs/proc/sys_net.go
+++ b/pkg/sentry/fs/proc/sys_net.go
@@ -26,6 +26,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -54,7 +55,7 @@ type tcpMemInode struct {
// size stores the tcp buffer size during save, and sets the buffer
// size in netstack in restore. We must save/restore this here, since
- // netstack itself is stateless.
+ // a netstack instance is created on restore.
size inet.TCPBufferSize
// mu protects against concurrent reads/writes to files based on this
@@ -258,6 +259,9 @@ func (f *tcpSackFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSeque
if src.NumBytes() == 0 {
return 0, nil
}
+
+ // Only consider size of one memory page for input for performance reasons.
+ // We are only reading if it's zero or not anyway.
src = src.TakeFirst(usermem.PageSize - 1)
var v int32
@@ -383,11 +387,125 @@ func (p *proc) newSysNetCore(ctx context.Context, msrc *fs.MountSource, s inet.S
return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil)
}
+// ipForwarding implements fs.InodeOperations.
+//
+// ipForwarding is used to enable/disable packet forwarding of netstack.
+//
+// +stateify savable
+type ipForwarding struct {
+ fsutil.SimpleFileInode
+
+ stack inet.Stack `state:"wait"`
+
+ // enabled stores the IPv4 forwarding state on save.
+ // We must save/restore this here, since a netstack instance
+ // is created on restore.
+ enabled *bool
+}
+
+func newIPForwardingInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode {
+ ipf := &ipForwarding{
+ SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC),
+ stack: s,
+ }
+ sattr := fs.StableAttr{
+ DeviceID: device.ProcDevice.DeviceID(),
+ InodeID: device.ProcDevice.NextIno(),
+ BlockSize: usermem.PageSize,
+ Type: fs.SpecialFile,
+ }
+ return fs.NewInode(ctx, ipf, msrc, sattr)
+}
+
+// Truncate implements fs.InodeOperations.Truncate. Truncate is called when
+// O_TRUNC is specified for any kind of existing Dirent but is not called via
+// (f)truncate for proc files.
+func (*ipForwarding) Truncate(context.Context, *fs.Inode, int64) error {
+ return nil
+}
+
+// +stateify savable
+type ipForwardingFile struct {
+ fsutil.FileGenericSeek `state:"nosave"`
+ fsutil.FileNoIoctl `state:"nosave"`
+ fsutil.FileNoMMap `state:"nosave"`
+ fsutil.FileNoSplice `state:"nosave"`
+ fsutil.FileNoopFlush `state:"nosave"`
+ fsutil.FileNoopFsync `state:"nosave"`
+ fsutil.FileNoopRelease `state:"nosave"`
+ fsutil.FileNotDirReaddir `state:"nosave"`
+ fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+ waiter.AlwaysReady `state:"nosave"`
+
+ ipf *ipForwarding
+
+ stack inet.Stack `state:"wait"`
+}
+
+// GetFile implements fs.InodeOperations.GetFile.
+func (ipf *ipForwarding) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+ flags.Pread = true
+ flags.Pwrite = true
+ return fs.NewFile(ctx, dirent, flags, &ipForwardingFile{
+ stack: ipf.stack,
+ ipf: ipf,
+ }), nil
+}
+
+// Read implements fs.FileOperations.Read.
+func (f *ipForwardingFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+ if offset != 0 {
+ return 0, io.EOF
+ }
+
+ if f.ipf.enabled == nil {
+ enabled := f.stack.Forwarding(ipv4.ProtocolNumber)
+ f.ipf.enabled = &enabled
+ }
+
+ val := "0\n"
+ if *f.ipf.enabled {
+ // Technically, this is not quite compatible with Linux. Linux
+ // stores these as an integer, so if you write "2" into
+ // ip_forward, you should get 2 back.
+ val = "1\n"
+ }
+ n, err := dst.CopyOut(ctx, []byte(val))
+ return int64(n), err
+}
+
+// Write implements fs.FileOperations.Write.
+//
+// Offset is ignored, multiple writes are not supported.
+func (f *ipForwardingFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+ if src.NumBytes() == 0 {
+ return 0, nil
+ }
+
+ // Only consider size of one memory page for input for performance reasons.
+ // We are only reading if it's zero or not anyway.
+ src = src.TakeFirst(usermem.PageSize - 1)
+
+ var v int32
+ n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+ if err != nil {
+ return n, err
+ }
+ if f.ipf.enabled == nil {
+ f.ipf.enabled = new(bool)
+ }
+ *f.ipf.enabled = v != 0
+ return n, f.stack.SetForwarding(ipv4.ProtocolNumber, *f.ipf.enabled)
+}
+
func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode {
contents := map[string]*fs.Inode{
// Add tcp_sack.
"tcp_sack": newTCPSackInode(ctx, msrc, s),
+ // Add ip_forward.
+ "ip_forward": newIPForwardingInode(ctx, msrc, s),
+
// The following files are simple stubs until they are
// implemented in netstack, most of these files are
// configuration related. We use the value closest to the
diff --git a/pkg/sentry/fs/proc/sys_net_state.go b/pkg/sentry/fs/proc/sys_net_state.go
index 6eba709c6..4cb4741af 100644
--- a/pkg/sentry/fs/proc/sys_net_state.go
+++ b/pkg/sentry/fs/proc/sys_net_state.go
@@ -14,7 +14,11 @@
package proc
-import "fmt"
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+)
// beforeSave is invoked by stateify.
func (t *tcpMemInode) beforeSave() {
@@ -40,3 +44,12 @@ func (s *tcpSack) afterLoad() {
}
}
}
+
+// afterLoad is invoked by stateify.
+func (ipf *ipForwarding) afterLoad() {
+ if ipf.enabled != nil {
+ if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, *ipf.enabled); err != nil {
+ panic(fmt.Sprintf("failed to set IPv4 forwarding [%v]: %v", *ipf.enabled, err))
+ }
+ }
+}
diff --git a/pkg/sentry/fs/proc/sys_net_test.go b/pkg/sentry/fs/proc/sys_net_test.go
index 355e83d47..6ef5738e7 100644
--- a/pkg/sentry/fs/proc/sys_net_test.go
+++ b/pkg/sentry/fs/proc/sys_net_test.go
@@ -123,3 +123,76 @@ func TestConfigureRecvBufferSize(t *testing.T) {
}
}
}
+
+// TestIPForwarding tests the implementation of
+// /proc/sys/net/ipv4/ip_forwarding
+func TestIPForwarding(t *testing.T) {
+ ctx := context.Background()
+ s := inet.NewTestStack()
+
+ var cases = []struct {
+ comment string
+ initial bool
+ str string
+ final bool
+ }{
+ {
+ comment: `Forwarding is disabled; write 1 and enable forwarding`,
+ initial: false,
+ str: "1",
+ final: true,
+ },
+ {
+ comment: `Forwarding is disabled; write 0 and disable forwarding`,
+ initial: false,
+ str: "0",
+ final: false,
+ },
+ {
+ comment: `Forwarding is enabled; write 1 and enable forwarding`,
+ initial: true,
+ str: "1",
+ final: true,
+ },
+ {
+ comment: `Forwarding is enabled; write 0 and disable forwarding`,
+ initial: true,
+ str: "0",
+ final: false,
+ },
+ {
+ comment: `Forwarding is disabled; write 2404 and enable forwarding`,
+ initial: false,
+ str: "2404",
+ final: true,
+ },
+ {
+ comment: `Forwarding is enabled; write 2404 and enable forwarding`,
+ initial: true,
+ str: "2404",
+ final: true,
+ },
+ }
+ for _, c := range cases {
+ t.Run(c.comment, func(t *testing.T) {
+ s.IPForwarding = c.initial
+ ipf := &ipForwarding{stack: s}
+ file := &ipForwardingFile{
+ stack: s,
+ ipf: ipf,
+ }
+
+ // Write the values.
+ src := usermem.BytesIOSequence([]byte(c.str))
+ if n, err := file.Write(ctx, nil, src, 0); n != int64(len(c.str)) || err != nil {
+ t.Errorf("file.Write(ctx, nil, %q, 0) = (%d, %v); want (%d, nil)", c.str, n, err, len(c.str))
+ }
+
+ // Read the values from the stack and check them.
+ if got, want := s.IPForwarding, c.final; got != want {
+ t.Errorf("s.IPForwarding incorrect; got: %v, want: %v", got, want)
+ }
+
+ })
+ }
+}
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index 9cf7f2a62..22d658acf 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -84,6 +84,7 @@ func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bo
"auxv": newAuxvec(t, msrc),
"cmdline": newExecArgInode(t, msrc, cmdlineExecArg),
"comm": newComm(t, msrc),
+ "cwd": newCwd(t, msrc),
"environ": newExecArgInode(t, msrc, environExecArg),
"exe": newExe(t, msrc),
"fd": newFdDir(t, msrc),
@@ -300,6 +301,49 @@ func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
return exec.PathnameWithDeleted(ctx), nil
}
+// cwd is an fs.InodeOperations symlink for the /proc/PID/cwd file.
+//
+// +stateify savable
+type cwd struct {
+ ramfs.Symlink
+
+ t *kernel.Task
+}
+
+func newCwd(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
+ cwdSymlink := &cwd{
+ Symlink: *ramfs.NewSymlink(t, fs.RootOwner, ""),
+ t: t,
+ }
+ return newProcInode(t, cwdSymlink, msrc, fs.Symlink, t)
+}
+
+// Readlink implements fs.InodeOperations.
+func (e *cwd) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
+ if !kernel.ContextCanTrace(ctx, e.t, false) {
+ return "", syserror.EACCES
+ }
+ if err := checkTaskState(e.t); err != nil {
+ return "", err
+ }
+ cwd := e.t.FSContext().WorkingDirectory()
+ if cwd == nil {
+ // It could have raced with process deletion.
+ return "", syserror.ESRCH
+ }
+ defer cwd.DecRef(ctx)
+
+ root := fs.RootFromContext(ctx)
+ if root == nil {
+ // It could have raced with process deletion.
+ return "", syserror.ESRCH
+ }
+ defer root.DecRef(ctx)
+
+ name, _ := cwd.FullName(root)
+ return name, nil
+}
+
// namespaceSymlink represents a symlink in the namespacefs, such as the files
// in /proc/<pid>/ns.
//
@@ -604,7 +648,7 @@ func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) (
var vss, rss, data uint64
s.t.WithMuLocked(func(t *kernel.Task) {
if fdTable := t.FDTable(); fdTable != nil {
- fds = fdTable.Size()
+ fds = fdTable.CurrentMaxFDs()
}
if mm := t.MemoryManager(); mm != nil {
vss = mm.VirtualMemorySize()
diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD
index 5cb0e0417..e6d0eb359 100644
--- a/pkg/sentry/fs/tty/BUILD
+++ b/pkg/sentry/fs/tty/BUILD
@@ -10,13 +10,14 @@ go_library(
"line_discipline.go",
"master.go",
"queue.go",
- "slave.go",
+ "replica.go",
"terminal.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
"//pkg/context",
+ "//pkg/marshal/primitive",
"//pkg/refs",
"//pkg/safemem",
"//pkg/sentry/arch",
diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go
index 463f6189e..c2da80bc2 100644
--- a/pkg/sentry/fs/tty/dir.go
+++ b/pkg/sentry/fs/tty/dir.go
@@ -37,14 +37,14 @@ import (
// This indirectly manages all terminals within the mount.
//
// New Terminals are created by masterInodeOperations.GetFile, which registers
-// the slave Inode in the this directory for discovery via Lookup/Readdir. The
-// slave inode is unregistered when the master file is Released, as the slave
+// the replica Inode in the this directory for discovery via Lookup/Readdir. The
+// replica inode is unregistered when the master file is Released, as the replica
// is no longer discoverable at that point.
//
// References on the underlying Terminal are held by masterFileOperations and
-// slaveInodeOperations.
+// replicaInodeOperations.
//
-// masterInodeOperations and slaveInodeOperations hold a pointer to
+// masterInodeOperations and replicaInodeOperations hold a pointer to
// dirInodeOperations, which is reference counted by the refcount their
// corresponding Dirents hold on their parent (this directory).
//
@@ -76,16 +76,16 @@ type dirInodeOperations struct {
// master is the master PTY inode.
master *fs.Inode
- // slaves contains the slave inodes reachable from the directory.
+ // replicas contains the replica inodes reachable from the directory.
//
- // A new slave is added by allocateTerminal and is removed by
+ // A new replica is added by allocateTerminal and is removed by
// masterFileOperations.Release.
//
- // A reference is held on every slave in the map.
- slaves map[uint32]*fs.Inode
+ // A reference is held on every replica in the map.
+ replicas map[uint32]*fs.Inode
// dentryMap is a SortedDentryMap used to implement Readdir containing
- // the master and all entries in slaves.
+ // the master and all entries in replicas.
dentryMap *fs.SortedDentryMap
// next is the next pty index to use.
@@ -101,7 +101,7 @@ func newDir(ctx context.Context, m *fs.MountSource) *fs.Inode {
d := &dirInodeOperations{
InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.RootOwner, fs.FilePermsFromMode(0555), linux.DEVPTS_SUPER_MAGIC),
msrc: m,
- slaves: make(map[uint32]*fs.Inode),
+ replicas: make(map[uint32]*fs.Inode),
dentryMap: fs.NewSortedDentryMap(nil),
}
// Linux devpts uses a default mode of 0000 for ptmx which can be
@@ -133,7 +133,7 @@ func (d *dirInodeOperations) Release(ctx context.Context) {
defer d.mu.Unlock()
d.master.DecRef(ctx)
- if len(d.slaves) != 0 {
+ if len(d.replicas) != 0 {
panic(fmt.Sprintf("devpts directory still contains active terminals: %+v", d))
}
}
@@ -149,14 +149,14 @@ func (d *dirInodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name str
return fs.NewDirent(ctx, d.master, name), nil
}
- // Slave number?
+ // Replica number?
n, err := strconv.ParseUint(name, 10, 32)
if err != nil {
// Not found.
return nil, syserror.ENOENT
}
- s, ok := d.slaves[uint32(n)]
+ s, ok := d.replicas[uint32(n)]
if !ok {
return nil, syserror.ENOENT
}
@@ -236,7 +236,7 @@ func (d *dirInodeOperations) allocateTerminal(ctx context.Context) (*Terminal, e
return nil, syserror.ENOMEM
}
- if _, ok := d.slaves[n]; ok {
+ if _, ok := d.replicas[n]; ok {
panic(fmt.Sprintf("pty index collision; index %d already exists", n))
}
@@ -244,19 +244,19 @@ func (d *dirInodeOperations) allocateTerminal(ctx context.Context) (*Terminal, e
d.next++
// The reference returned by newTerminal is returned to the caller.
- // Take another for the slave inode.
+ // Take another for the replica inode.
t.IncRef()
// Create a pts node. The owner is based on the context that opens
// ptmx.
creds := auth.CredentialsFromContext(ctx)
uid, gid := creds.EffectiveKUID, creds.EffectiveKGID
- slave := newSlaveInode(ctx, d, t, fs.FileOwner{uid, gid}, fs.FilePermsFromMode(0666))
+ replica := newReplicaInode(ctx, d, t, fs.FileOwner{uid, gid}, fs.FilePermsFromMode(0666))
- d.slaves[n] = slave
+ d.replicas[n] = replica
d.dentryMap.Add(strconv.FormatUint(uint64(n), 10), fs.DentAttr{
- Type: slave.StableAttr.Type,
- InodeID: slave.StableAttr.InodeID,
+ Type: replica.StableAttr.Type,
+ InodeID: replica.StableAttr.InodeID,
})
return t, nil
@@ -267,18 +267,18 @@ func (d *dirInodeOperations) masterClose(ctx context.Context, t *Terminal) {
d.mu.Lock()
defer d.mu.Unlock()
- // The slave end disappears from the directory when the master end is
- // closed, even if the slave end is open elsewhere.
+ // The replica end disappears from the directory when the master end is
+ // closed, even if the replica end is open elsewhere.
//
// N.B. since we're using a backdoor method to remove a directory entry
// we won't properly fire inotify events like Linux would.
- s, ok := d.slaves[t.n]
+ s, ok := d.replicas[t.n]
if !ok {
panic(fmt.Sprintf("Terminal %+v doesn't exist in %+v?", t, d))
}
s.DecRef(ctx)
- delete(d.slaves, t.n)
+ delete(d.replicas, t.n)
d.dentryMap.Remove(strconv.FormatUint(uint64(t.n), 10))
}
diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go
index 2d4d44bf3..13f4901db 100644
--- a/pkg/sentry/fs/tty/fs.go
+++ b/pkg/sentry/fs/tty/fs.go
@@ -79,8 +79,8 @@ type superOperations struct{}
//
// It always returns true, forcing a Lookup for all entries.
//
-// Slave entries are dropped from dir when their master is closed, so an
-// existing slave Dirent in the tree is not sufficient to guarantee that it
+// Replica entries are dropped from dir when their master is closed, so an
+// existing replica Dirent in the tree is not sufficient to guarantee that it
// still exists on the filesystem.
func (superOperations) Revalidate(context.Context, string, *fs.Inode, *fs.Inode) bool {
return true
diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go
index 2e9dd2d55..b34f4a0eb 100644
--- a/pkg/sentry/fs/tty/line_discipline.go
+++ b/pkg/sentry/fs/tty/line_discipline.go
@@ -21,6 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -43,7 +44,7 @@ const (
)
// lineDiscipline dictates how input and output are handled between the
-// pseudoterminal (pty) master and slave. It can be configured to alter I/O,
+// pseudoterminal (pty) master and replica. It can be configured to alter I/O,
// modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man
// pages are good resources for how to affect the line discipline:
//
@@ -54,8 +55,8 @@ const (
//
// lineDiscipline has a simple structure but supports a multitude of options
// (see the above man pages). It consists of two queues of bytes: one from the
-// terminal master to slave (the input queue) and one from slave to master (the
-// output queue). When bytes are written to one end of the pty, the line
+// terminal master to replica (the input queue) and one from replica to master
+// (the output queue). When bytes are written to one end of the pty, the line
// discipline reads the bytes, modifies them or takes special action if
// required, and enqueues them to be read by the other end of the pty:
//
@@ -64,7 +65,7 @@ const (
// | (inputQueueWrite) +-------------+ (inputQueueRead) |
// | |
// | v
-// masterFD slaveFD
+// masterFD replicaFD
// ^ |
// | |
// | output to terminal +--------------+ output from process |
@@ -103,8 +104,8 @@ type lineDiscipline struct {
// masterWaiter is used to wait on the master end of the TTY.
masterWaiter waiter.Queue `state:"zerovalue"`
- // slaveWaiter is used to wait on the slave end of the TTY.
- slaveWaiter waiter.Queue `state:"zerovalue"`
+ // replicaWaiter is used to wait on the replica end of the TTY.
+ replicaWaiter waiter.Queue `state:"zerovalue"`
}
func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
@@ -115,27 +116,23 @@ func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
}
// getTermios gets the linux.Termios for the tty.
-func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (l *lineDiscipline) getTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) {
l.termiosMu.RLock()
defer l.termiosMu.RUnlock()
// We must copy a Termios struct, not KernelTermios.
t := l.termios.ToTermios()
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := t.CopyOut(task, args[2].Pointer())
return 0, err
}
// setTermios sets a linux.Termios for the tty.
-func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (l *lineDiscipline) setTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) {
l.termiosMu.Lock()
defer l.termiosMu.Unlock()
oldCanonEnabled := l.termios.LEnabled(linux.ICANON)
// We must copy a Termios struct, not KernelTermios.
var t linux.Termios
- _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := t.CopyIn(task, args[2].Pointer())
l.termios.FromTermios(t)
// If canonical mode is turned off, move bytes from inQueue's wait
@@ -146,27 +143,23 @@ func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arc
l.inQueue.pushWaitBufLocked(l)
l.inQueue.readable = true
l.inQueue.mu.Unlock()
- l.slaveWaiter.Notify(waiter.EventIn)
+ l.replicaWaiter.Notify(waiter.EventIn)
}
return 0, err
}
-func (l *lineDiscipline) windowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (l *lineDiscipline) windowSize(t *kernel.Task, args arch.SyscallArguments) error {
l.sizeMu.Lock()
defer l.sizeMu.Unlock()
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), l.size, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := l.size.CopyOut(t, args[2].Pointer())
return err
}
-func (l *lineDiscipline) setWindowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (l *lineDiscipline) setWindowSize(t *kernel.Task, args arch.SyscallArguments) error {
l.sizeMu.Lock()
defer l.sizeMu.Unlock()
- _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &l.size, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := l.size.CopyIn(t, args[2].Pointer())
return err
}
@@ -176,14 +169,14 @@ func (l *lineDiscipline) masterReadiness() waiter.EventMask {
return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios)
}
-func (l *lineDiscipline) slaveReadiness() waiter.EventMask {
+func (l *lineDiscipline) replicaReadiness() waiter.EventMask {
l.termiosMu.RLock()
defer l.termiosMu.RUnlock()
return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios)
}
-func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
- return l.inQueue.readableSize(ctx, io, args)
+func (l *lineDiscipline) inputQueueReadSize(t *kernel.Task, args arch.SyscallArguments) error {
+ return l.inQueue.readableSize(t, args)
}
func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
@@ -196,7 +189,7 @@ func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSeque
if n > 0 {
l.masterWaiter.Notify(waiter.EventOut)
if pushed {
- l.slaveWaiter.Notify(waiter.EventIn)
+ l.replicaWaiter.Notify(waiter.EventIn)
}
return n, nil
}
@@ -211,14 +204,14 @@ func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequ
return 0, err
}
if n > 0 {
- l.slaveWaiter.Notify(waiter.EventIn)
+ l.replicaWaiter.Notify(waiter.EventIn)
return n, nil
}
return 0, syserror.ErrWouldBlock
}
-func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
- return l.outQueue.readableSize(ctx, io, args)
+func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, args arch.SyscallArguments) error {
+ return l.outQueue.readableSize(t, args)
}
func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
@@ -229,7 +222,7 @@ func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequ
return 0, err
}
if n > 0 {
- l.slaveWaiter.Notify(waiter.EventOut)
+ l.replicaWaiter.Notify(waiter.EventOut)
if pushed {
l.masterWaiter.Notify(waiter.EventIn)
}
diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go
index e00746017..b91184b1b 100644
--- a/pkg/sentry/fs/tty/master.go
+++ b/pkg/sentry/fs/tty/master.go
@@ -17,9 +17,11 @@ package tty
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/unimpl"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -152,46 +154,51 @@ func (mf *masterFileOperations) Write(ctx context.Context, _ *fs.File, src userm
// Ioctl implements fs.FileOperations.Ioctl.
func (mf *masterFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ // ioctl(2) may only be called from a task goroutine.
+ return 0, syserror.ENOTTY
+ }
+
switch cmd := args[1].Uint(); cmd {
case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
// Get the number of bytes in the output queue read buffer.
- return 0, mf.t.ld.outputQueueReadSize(ctx, io, args)
+ return 0, mf.t.ld.outputQueueReadSize(t, args)
case linux.TCGETS:
// N.B. TCGETS on the master actually returns the configuration
- // of the slave end.
- return mf.t.ld.getTermios(ctx, io, args)
+ // of the replica end.
+ return mf.t.ld.getTermios(t, args)
case linux.TCSETS:
// N.B. TCSETS on the master actually affects the configuration
- // of the slave end.
- return mf.t.ld.setTermios(ctx, io, args)
+ // of the replica end.
+ return mf.t.ld.setTermios(t, args)
case linux.TCSETSW:
// TODO(b/29356795): This should drain the output queue first.
- return mf.t.ld.setTermios(ctx, io, args)
+ return mf.t.ld.setTermios(t, args)
case linux.TIOCGPTN:
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mf.t.n), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ nP := primitive.Uint32(mf.t.n)
+ _, err := nP.CopyOut(t, args[2].Pointer())
return 0, err
case linux.TIOCSPTLCK:
// TODO(b/29356795): Implement pty locking. For now just pretend we do.
return 0, nil
case linux.TIOCGWINSZ:
- return 0, mf.t.ld.windowSize(ctx, io, args)
+ return 0, mf.t.ld.windowSize(t, args)
case linux.TIOCSWINSZ:
- return 0, mf.t.ld.setWindowSize(ctx, io, args)
+ return 0, mf.t.ld.setWindowSize(t, args)
case linux.TIOCSCTTY:
// Make the given terminal the controlling terminal of the
// calling process.
- return 0, mf.t.setControllingTTY(ctx, io, args, true /* isMaster */)
+ return 0, mf.t.setControllingTTY(ctx, args, true /* isMaster */)
case linux.TIOCNOTTY:
// Release this process's controlling terminal.
- return 0, mf.t.releaseControllingTTY(ctx, io, args, true /* isMaster */)
+ return 0, mf.t.releaseControllingTTY(ctx, args, true /* isMaster */)
case linux.TIOCGPGRP:
// Get the foreground process group.
- return mf.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */)
+ return mf.t.foregroundProcessGroup(ctx, args, true /* isMaster */)
case linux.TIOCSPGRP:
// Set the foreground process group.
- return mf.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */)
+ return mf.t.setForegroundProcessGroup(ctx, args, true /* isMaster */)
default:
maybeEmitUnimplementedEvent(ctx, cmd)
return 0, syserror.ENOTTY
diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go
index c5d7ec717..79975d812 100644
--- a/pkg/sentry/fs/tty/queue.go
+++ b/pkg/sentry/fs/tty/queue.go
@@ -17,8 +17,10 @@ package tty
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -32,7 +34,7 @@ import (
const waitBufMaxBytes = 131072
// queue represents one of the input or output queues between a pty master and
-// slave. Bytes written to a queue are added to the read buffer until it is
+// replica. Bytes written to a queue are added to the read buffer until it is
// full, at which point they are written to the wait buffer. Bytes are
// processed (i.e. undergo termios transformations) as they are added to the
// read buffer. The read buffer is readable when its length is nonzero and
@@ -85,17 +87,15 @@ func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask {
}
// readableSize writes the number of readable bytes to userspace.
-func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (q *queue) readableSize(t *kernel.Task, args arch.SyscallArguments) error {
q.mu.Lock()
defer q.mu.Unlock()
- var size int32
+ size := primitive.Int32(0)
if q.readable {
- size = int32(len(q.readBuf))
+ size = primitive.Int32(len(q.readBuf))
}
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := size.CopyOut(t, args[2].Pointer())
return err
}
diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/replica.go
index 7c7292687..385d230fb 100644
--- a/pkg/sentry/fs/tty/slave.go
+++ b/pkg/sentry/fs/tty/replica.go
@@ -17,9 +17,11 @@ package tty
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
@@ -27,11 +29,11 @@ import (
// LINT.IfChange
-// slaveInodeOperations are the fs.InodeOperations for the slave end of the
+// replicaInodeOperations are the fs.InodeOperations for the replica end of the
// Terminal (pts file).
//
// +stateify savable
-type slaveInodeOperations struct {
+type replicaInodeOperations struct {
fsutil.SimpleFileInode
// d is the containing dir.
@@ -41,13 +43,13 @@ type slaveInodeOperations struct {
t *Terminal
}
-var _ fs.InodeOperations = (*slaveInodeOperations)(nil)
+var _ fs.InodeOperations = (*replicaInodeOperations)(nil)
-// newSlaveInode creates an fs.Inode for the slave end of a terminal.
+// newReplicaInode creates an fs.Inode for the replica end of a terminal.
//
-// newSlaveInode takes ownership of t.
-func newSlaveInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owner fs.FileOwner, p fs.FilePermissions) *fs.Inode {
- iops := &slaveInodeOperations{
+// newReplicaInode takes ownership of t.
+func newReplicaInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owner fs.FileOwner, p fs.FilePermissions) *fs.Inode {
+ iops := &replicaInodeOperations{
SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, owner, p, linux.DEVPTS_SUPER_MAGIC),
d: d,
t: t,
@@ -64,18 +66,18 @@ func newSlaveInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owne
Type: fs.CharacterDevice,
// See fs/devpts/inode.c:devpts_fill_super.
BlockSize: 1024,
- DeviceFileMajor: linux.UNIX98_PTY_SLAVE_MAJOR,
+ DeviceFileMajor: linux.UNIX98_PTY_REPLICA_MAJOR,
DeviceFileMinor: t.n,
})
}
// Release implements fs.InodeOperations.Release.
-func (si *slaveInodeOperations) Release(ctx context.Context) {
+func (si *replicaInodeOperations) Release(ctx context.Context) {
si.t.DecRef(ctx)
}
// Truncate implements fs.InodeOperations.Truncate.
-func (*slaveInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
+func (*replicaInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
return nil
}
@@ -83,14 +85,15 @@ func (*slaveInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
//
// This may race with destruction of the terminal. If the terminal is gone, it
// returns ENOENT.
-func (si *slaveInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
- return fs.NewFile(ctx, d, flags, &slaveFileOperations{si: si}), nil
+func (si *replicaInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+ return fs.NewFile(ctx, d, flags, &replicaFileOperations{si: si}), nil
}
-// slaveFileOperations are the fs.FileOperations for the slave end of a terminal.
+// replicaFileOperations are the fs.FileOperations for the replica end of a
+// terminal.
//
// +stateify savable
-type slaveFileOperations struct {
+type replicaFileOperations struct {
fsutil.FilePipeSeek `state:"nosave"`
fsutil.FileNotDirReaddir `state:"nosave"`
fsutil.FileNoFsync `state:"nosave"`
@@ -100,79 +103,84 @@ type slaveFileOperations struct {
fsutil.FileUseInodeUnstableAttr `state:"nosave"`
// si is the inode operations.
- si *slaveInodeOperations
+ si *replicaInodeOperations
}
-var _ fs.FileOperations = (*slaveFileOperations)(nil)
+var _ fs.FileOperations = (*replicaFileOperations)(nil)
// Release implements fs.FileOperations.Release.
-func (sf *slaveFileOperations) Release(context.Context) {
+func (sf *replicaFileOperations) Release(context.Context) {
}
// EventRegister implements waiter.Waitable.EventRegister.
-func (sf *slaveFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
- sf.si.t.ld.slaveWaiter.EventRegister(e, mask)
+func (sf *replicaFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ sf.si.t.ld.replicaWaiter.EventRegister(e, mask)
}
// EventUnregister implements waiter.Waitable.EventUnregister.
-func (sf *slaveFileOperations) EventUnregister(e *waiter.Entry) {
- sf.si.t.ld.slaveWaiter.EventUnregister(e)
+func (sf *replicaFileOperations) EventUnregister(e *waiter.Entry) {
+ sf.si.t.ld.replicaWaiter.EventUnregister(e)
}
// Readiness implements waiter.Waitable.Readiness.
-func (sf *slaveFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
- return sf.si.t.ld.slaveReadiness()
+func (sf *replicaFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return sf.si.t.ld.replicaReadiness()
}
// Read implements fs.FileOperations.Read.
-func (sf *slaveFileOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
+func (sf *replicaFileOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
return sf.si.t.ld.inputQueueRead(ctx, dst)
}
// Write implements fs.FileOperations.Write.
-func (sf *slaveFileOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
+func (sf *replicaFileOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
return sf.si.t.ld.outputQueueWrite(ctx, src)
}
// Ioctl implements fs.FileOperations.Ioctl.
-func (sf *slaveFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (sf *replicaFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ // ioctl(2) may only be called from a task goroutine.
+ return 0, syserror.ENOTTY
+ }
+
switch cmd := args[1].Uint(); cmd {
case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
// Get the number of bytes in the input queue read buffer.
- return 0, sf.si.t.ld.inputQueueReadSize(ctx, io, args)
+ return 0, sf.si.t.ld.inputQueueReadSize(t, args)
case linux.TCGETS:
- return sf.si.t.ld.getTermios(ctx, io, args)
+ return sf.si.t.ld.getTermios(t, args)
case linux.TCSETS:
- return sf.si.t.ld.setTermios(ctx, io, args)
+ return sf.si.t.ld.setTermios(t, args)
case linux.TCSETSW:
// TODO(b/29356795): This should drain the output queue first.
- return sf.si.t.ld.setTermios(ctx, io, args)
+ return sf.si.t.ld.setTermios(t, args)
case linux.TIOCGPTN:
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sf.si.t.n), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ nP := primitive.Uint32(sf.si.t.n)
+ _, err := nP.CopyOut(t, args[2].Pointer())
return 0, err
case linux.TIOCGWINSZ:
- return 0, sf.si.t.ld.windowSize(ctx, io, args)
+ return 0, sf.si.t.ld.windowSize(t, args)
case linux.TIOCSWINSZ:
- return 0, sf.si.t.ld.setWindowSize(ctx, io, args)
+ return 0, sf.si.t.ld.setWindowSize(t, args)
case linux.TIOCSCTTY:
// Make the given terminal the controlling terminal of the
// calling process.
- return 0, sf.si.t.setControllingTTY(ctx, io, args, false /* isMaster */)
+ return 0, sf.si.t.setControllingTTY(ctx, args, false /* isMaster */)
case linux.TIOCNOTTY:
// Release this process's controlling terminal.
- return 0, sf.si.t.releaseControllingTTY(ctx, io, args, false /* isMaster */)
+ return 0, sf.si.t.releaseControllingTTY(ctx, args, false /* isMaster */)
case linux.TIOCGPGRP:
// Get the foreground process group.
- return sf.si.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */)
+ return sf.si.t.foregroundProcessGroup(ctx, args, false /* isMaster */)
case linux.TIOCSPGRP:
// Set the foreground process group.
- return sf.si.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */)
+ return sf.si.t.setForegroundProcessGroup(ctx, args, false /* isMaster */)
default:
maybeEmitUnimplementedEvent(ctx, cmd)
return 0, syserror.ENOTTY
}
}
-// LINT.ThenChange(../../fsimpl/devpts/slave.go)
+// LINT.ThenChange(../../fsimpl/devpts/replica.go)
diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go
index ddcccf4da..4f431d74d 100644
--- a/pkg/sentry/fs/tty/terminal.go
+++ b/pkg/sentry/fs/tty/terminal.go
@@ -17,10 +17,10 @@ package tty
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/usermem"
)
// LINT.IfChange
@@ -44,19 +44,19 @@ type Terminal struct {
// this terminal. This field is immutable.
masterKTTY *kernel.TTY
- // slaveKTTY contains the controlling process of the slave end of this
+ // replicaKTTY contains the controlling process of the replica end of this
// terminal. This field is immutable.
- slaveKTTY *kernel.TTY
+ replicaKTTY *kernel.TTY
}
func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal {
- termios := linux.DefaultSlaveTermios
+ termios := linux.DefaultReplicaTermios
t := Terminal{
- d: d,
- n: n,
- ld: newLineDiscipline(termios),
- masterKTTY: &kernel.TTY{Index: n},
- slaveKTTY: &kernel.TTY{Index: n},
+ d: d,
+ n: n,
+ ld: newLineDiscipline(termios),
+ masterKTTY: &kernel.TTY{Index: n},
+ replicaKTTY: &kernel.TTY{Index: n},
}
t.EnableLeakCheck("tty.Terminal")
return &t
@@ -64,7 +64,7 @@ func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal
// setControllingTTY makes tm the controlling terminal of the calling thread
// group.
-func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+func (tm *Terminal) setControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error {
task := kernel.TaskFromContext(ctx)
if task == nil {
panic("setControllingTTY must be called from a task context")
@@ -75,7 +75,7 @@ func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args a
// releaseControllingTTY removes tm as the controlling terminal of the calling
// thread group.
-func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+func (tm *Terminal) releaseControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error {
task := kernel.TaskFromContext(ctx)
if task == nil {
panic("releaseControllingTTY must be called from a task context")
@@ -85,7 +85,7 @@ func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, ar
}
// foregroundProcessGroup gets the process group ID of tm's foreground process.
-func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+func (tm *Terminal) foregroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
task := kernel.TaskFromContext(ctx)
if task == nil {
panic("foregroundProcessGroup must be called from a task context")
@@ -97,24 +97,21 @@ func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, a
}
// Write it out to *arg.
- _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ retP := primitive.Int32(ret)
+ _, err = retP.CopyOut(task, args[2].Pointer())
return 0, err
}
// foregroundProcessGroup sets tm's foreground process.
-func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
task := kernel.TaskFromContext(ctx)
if task == nil {
panic("setForegroundProcessGroup must be called from a task context")
}
// Read in the process group ID.
- var pgid int32
- if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ var pgid primitive.Int32
+ if _, err := pgid.CopyIn(task, args[2].Pointer()); err != nil {
return 0, err
}
@@ -126,7 +123,7 @@ func (tm *Terminal) tty(isMaster bool) *kernel.TTY {
if isMaster {
return tm.masterKTTY
}
- return tm.slaveKTTY
+ return tm.replicaKTTY
}
// LINT.ThenChange(../../fsimpl/devpts/terminal.go)
diff --git a/pkg/sentry/fs/tty/tty_test.go b/pkg/sentry/fs/tty/tty_test.go
index 2cbc05678..49edee83d 100644
--- a/pkg/sentry/fs/tty/tty_test.go
+++ b/pkg/sentry/fs/tty/tty_test.go
@@ -22,8 +22,8 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
-func TestSimpleMasterToSlave(t *testing.T) {
- ld := newLineDiscipline(linux.DefaultSlaveTermios)
+func TestSimpleMasterToReplica(t *testing.T) {
+ ld := newLineDiscipline(linux.DefaultReplicaTermios)
ctx := contexttest.Context(t)
inBytes := []byte("hello, tty\n")
src := usermem.BytesIOSequence(inBytes)
diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD
index 3f64fab3a..48e13613a 100644
--- a/pkg/sentry/fsimpl/devpts/BUILD
+++ b/pkg/sentry/fsimpl/devpts/BUILD
@@ -21,8 +21,8 @@ go_library(
"line_discipline.go",
"master.go",
"queue.go",
+ "replica.go",
"root_inode_refs.go",
- "slave.go",
"terminal.go",
],
visibility = ["//pkg/sentry:internal"],
@@ -30,6 +30,8 @@ go_library(
"//pkg/abi/linux",
"//pkg/context",
"//pkg/log",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
"//pkg/refs",
"//pkg/safemem",
"//pkg/sentry/arch",
diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go
index 57580f4d4..903135fae 100644
--- a/pkg/sentry/fsimpl/devpts/devpts.go
+++ b/pkg/sentry/fsimpl/devpts/devpts.go
@@ -35,6 +35,8 @@ import (
const Name = "devpts"
// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type FilesystemType struct{}
// Name implements vfs.FilesystemType.Name.
@@ -58,6 +60,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
return fs.Filesystem.VFSFilesystem(), root.VFSDentry(), nil
}
+// +stateify savable
type filesystem struct {
kernfs.Filesystem
@@ -79,7 +82,7 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds
// Construct the root directory. This is always inode id 1.
root := &rootInode{
- slaves: make(map[uint32]*slaveInode),
+ replicas: make(map[uint32]*replicaInode),
}
root.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555)
root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
@@ -110,6 +113,8 @@ func (fs *filesystem) Release(ctx context.Context) {
}
// rootInode is the root directory inode for the devpts mounts.
+//
+// +stateify savable
type rootInode struct {
implStatFS
kernfs.AlwaysValid
@@ -131,10 +136,10 @@ type rootInode struct {
root *rootInode
// mu protects the fields below.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
- // slaves maps pty ids to slave inodes.
- slaves map[uint32]*slaveInode
+ // replicas maps pty ids to replica inodes.
+ replicas map[uint32]*replicaInode
// nextIdx is the next pty index to use. Must be accessed atomically.
//
@@ -154,22 +159,22 @@ func (i *rootInode) allocateTerminal(creds *auth.Credentials) (*Terminal, error)
idx := i.nextIdx
i.nextIdx++
- // Sanity check that slave with idx does not exist.
- if _, ok := i.slaves[idx]; ok {
+ // Sanity check that replica with idx does not exist.
+ if _, ok := i.replicas[idx]; ok {
panic(fmt.Sprintf("pty index collision; index %d already exists", idx))
}
- // Create the new terminal and slave.
+ // Create the new terminal and replica.
t := newTerminal(idx)
- slave := &slaveInode{
+ replica := &replicaInode{
root: i,
t: t,
}
// Linux always uses pty index + 3 as the inode id. See
// fs/devpts/inode.c:devpts_pty_new().
- slave.InodeAttrs.Init(creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600)
- slave.dentry.Init(slave)
- i.slaves[idx] = slave
+ replica.InodeAttrs.Init(creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600)
+ replica.dentry.Init(replica)
+ i.replicas[idx] = replica
return t, nil
}
@@ -179,16 +184,16 @@ func (i *rootInode) masterClose(t *Terminal) {
i.mu.Lock()
defer i.mu.Unlock()
- // Sanity check that slave with idx exists.
- if _, ok := i.slaves[t.n]; !ok {
+ // Sanity check that replica with idx exists.
+ if _, ok := i.replicas[t.n]; !ok {
panic(fmt.Sprintf("pty with index %d does not exist", t.n))
}
- delete(i.slaves, t.n)
+ delete(i.replicas, t.n)
}
// Open implements kernfs.Inode.Open.
-func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
SeekEnd: kernfs.SeekEndStaticEntries,
})
if err != nil {
@@ -198,16 +203,16 @@ func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D
}
// Lookup implements kernfs.Inode.Lookup.
-func (i *rootInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+func (i *rootInode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, error) {
idx, err := strconv.ParseUint(name, 10, 32)
if err != nil {
return nil, syserror.ENOENT
}
i.mu.Lock()
defer i.mu.Unlock()
- if si, ok := i.slaves[uint32(idx)]; ok {
+ if si, ok := i.replicas[uint32(idx)]; ok {
si.dentry.IncRef()
- return si.dentry.VFSDentry(), nil
+ return &si.dentry, nil
}
return nil, syserror.ENOENT
@@ -217,8 +222,8 @@ func (i *rootInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error
func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
i.mu.Lock()
defer i.mu.Unlock()
- ids := make([]int, 0, len(i.slaves))
- for id := range i.slaves {
+ ids := make([]int, 0, len(i.replicas))
+ for id := range i.replicas {
ids = append(ids, int(id))
}
sort.Ints(ids)
@@ -226,7 +231,7 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback,
dirent := vfs.Dirent{
Name: strconv.FormatUint(uint64(id), 10),
Type: linux.DT_CHR,
- Ino: i.slaves[uint32(id)].InodeAttrs.Ino(),
+ Ino: i.replicas[uint32(id)].InodeAttrs.Ino(),
NextOff: offset + 1,
}
if err := cb.Handle(dirent); err != nil {
@@ -237,11 +242,12 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback,
return offset, nil
}
-// DecRef implements kernfs.Inode.
+// DecRef implements kernfs.Inode.DecRef.
func (i *rootInode) DecRef(context.Context) {
i.rootInodeRefs.DecRef(i.Destroy)
}
+// +stateify savable
type implStatFS struct{}
// StatFS implements kernfs.Inode.StatFS.
diff --git a/pkg/sentry/fsimpl/devpts/devpts_test.go b/pkg/sentry/fsimpl/devpts/devpts_test.go
index b7c149047..448390cfe 100644
--- a/pkg/sentry/fsimpl/devpts/devpts_test.go
+++ b/pkg/sentry/fsimpl/devpts/devpts_test.go
@@ -22,8 +22,8 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
-func TestSimpleMasterToSlave(t *testing.T) {
- ld := newLineDiscipline(linux.DefaultSlaveTermios)
+func TestSimpleMasterToReplica(t *testing.T) {
+ ld := newLineDiscipline(linux.DefaultReplicaTermios)
ctx := contexttest.Context(t)
inBytes := []byte("hello, tty\n")
src := usermem.BytesIOSequence(inBytes)
diff --git a/pkg/sentry/fsimpl/devpts/line_discipline.go b/pkg/sentry/fsimpl/devpts/line_discipline.go
index f7bc325d1..e6b0e81cf 100644
--- a/pkg/sentry/fsimpl/devpts/line_discipline.go
+++ b/pkg/sentry/fsimpl/devpts/line_discipline.go
@@ -21,6 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -41,7 +42,7 @@ const (
)
// lineDiscipline dictates how input and output are handled between the
-// pseudoterminal (pty) master and slave. It can be configured to alter I/O,
+// pseudoterminal (pty) master and replica. It can be configured to alter I/O,
// modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man
// pages are good resources for how to affect the line discipline:
//
@@ -52,8 +53,8 @@ const (
//
// lineDiscipline has a simple structure but supports a multitude of options
// (see the above man pages). It consists of two queues of bytes: one from the
-// terminal master to slave (the input queue) and one from slave to master (the
-// output queue). When bytes are written to one end of the pty, the line
+// terminal master to replica (the input queue) and one from replica to master
+// (the output queue). When bytes are written to one end of the pty, the line
// discipline reads the bytes, modifies them or takes special action if
// required, and enqueues them to be read by the other end of the pty:
//
@@ -62,7 +63,7 @@ const (
// | (inputQueueWrite) +-------------+ (inputQueueRead) |
// | |
// | v
-// masterFD slaveFD
+// masterFD replicaFD
// ^ |
// | |
// | output to terminal +--------------+ output from process |
@@ -101,8 +102,8 @@ type lineDiscipline struct {
// masterWaiter is used to wait on the master end of the TTY.
masterWaiter waiter.Queue `state:"zerovalue"`
- // slaveWaiter is used to wait on the slave end of the TTY.
- slaveWaiter waiter.Queue `state:"zerovalue"`
+ // replicaWaiter is used to wait on the replica end of the TTY.
+ replicaWaiter waiter.Queue `state:"zerovalue"`
}
func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
@@ -113,27 +114,23 @@ func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
}
// getTermios gets the linux.Termios for the tty.
-func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (l *lineDiscipline) getTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) {
l.termiosMu.RLock()
defer l.termiosMu.RUnlock()
// We must copy a Termios struct, not KernelTermios.
t := l.termios.ToTermios()
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := t.CopyOut(task, args[2].Pointer())
return 0, err
}
// setTermios sets a linux.Termios for the tty.
-func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (l *lineDiscipline) setTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) {
l.termiosMu.Lock()
defer l.termiosMu.Unlock()
oldCanonEnabled := l.termios.LEnabled(linux.ICANON)
// We must copy a Termios struct, not KernelTermios.
var t linux.Termios
- _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := t.CopyIn(task, args[2].Pointer())
l.termios.FromTermios(t)
// If canonical mode is turned off, move bytes from inQueue's wait
@@ -144,27 +141,23 @@ func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arc
l.inQueue.pushWaitBufLocked(l)
l.inQueue.readable = true
l.inQueue.mu.Unlock()
- l.slaveWaiter.Notify(waiter.EventIn)
+ l.replicaWaiter.Notify(waiter.EventIn)
}
return 0, err
}
-func (l *lineDiscipline) windowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (l *lineDiscipline) windowSize(t *kernel.Task, args arch.SyscallArguments) error {
l.sizeMu.Lock()
defer l.sizeMu.Unlock()
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), l.size, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := l.size.CopyOut(t, args[2].Pointer())
return err
}
-func (l *lineDiscipline) setWindowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (l *lineDiscipline) setWindowSize(t *kernel.Task, args arch.SyscallArguments) error {
l.sizeMu.Lock()
defer l.sizeMu.Unlock()
- _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &l.size, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := l.size.CopyIn(t, args[2].Pointer())
return err
}
@@ -174,14 +167,14 @@ func (l *lineDiscipline) masterReadiness() waiter.EventMask {
return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios)
}
-func (l *lineDiscipline) slaveReadiness() waiter.EventMask {
+func (l *lineDiscipline) replicaReadiness() waiter.EventMask {
l.termiosMu.RLock()
defer l.termiosMu.RUnlock()
return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios)
}
-func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
- return l.inQueue.readableSize(ctx, io, args)
+func (l *lineDiscipline) inputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error {
+ return l.inQueue.readableSize(t, io, args)
}
func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
@@ -194,7 +187,7 @@ func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSeque
if n > 0 {
l.masterWaiter.Notify(waiter.EventOut)
if pushed {
- l.slaveWaiter.Notify(waiter.EventIn)
+ l.replicaWaiter.Notify(waiter.EventIn)
}
return n, nil
}
@@ -209,14 +202,14 @@ func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequ
return 0, err
}
if n > 0 {
- l.slaveWaiter.Notify(waiter.EventIn)
+ l.replicaWaiter.Notify(waiter.EventIn)
return n, nil
}
return 0, syserror.ErrWouldBlock
}
-func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
- return l.outQueue.readableSize(ctx, io, args)
+func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error {
+ return l.outQueue.readableSize(t, io, args)
}
func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
@@ -227,7 +220,7 @@ func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequ
return 0, err
}
if n > 0 {
- l.slaveWaiter.Notify(waiter.EventOut)
+ l.replicaWaiter.Notify(waiter.EventOut)
if pushed {
l.masterWaiter.Notify(waiter.EventIn)
}
diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go
index 60feb1993..69c2fe951 100644
--- a/pkg/sentry/fsimpl/devpts/master.go
+++ b/pkg/sentry/fsimpl/devpts/master.go
@@ -17,9 +17,11 @@ package devpts
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/unimpl"
"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -29,6 +31,8 @@ import (
)
// masterInode is the inode for the master end of the Terminal.
+//
+// +stateify savable
type masterInode struct {
implStatFS
kernfs.InodeAttrs
@@ -48,20 +52,18 @@ type masterInode struct {
var _ kernfs.Inode = (*masterInode)(nil)
// Open implements kernfs.Inode.Open.
-func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
t, err := mi.root.allocateTerminal(rp.Credentials())
if err != nil {
return nil, err
}
- mi.IncRef()
fd := &masterFileDescription{
inode: mi,
t: t,
}
fd.LockFD.Init(&mi.locks)
- if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
- mi.DecRef(ctx)
+ if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
return nil, err
}
return &fd.vfsfd, nil
@@ -87,6 +89,7 @@ func (mi *masterInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds
return mi.InodeAttrs.SetStat(ctx, vfsfs, creds, opts)
}
+// +stateify savable
type masterFileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
@@ -101,7 +104,6 @@ var _ vfs.FileDescriptionImpl = (*masterFileDescription)(nil)
// Release implements vfs.FileDescriptionImpl.Release.
func (mfd *masterFileDescription) Release(ctx context.Context) {
mfd.inode.root.masterClose(mfd.t)
- mfd.inode.DecRef(ctx)
}
// EventRegister implements waiter.Waitable.EventRegister.
@@ -131,46 +133,51 @@ func (mfd *masterFileDescription) Write(ctx context.Context, src usermem.IOSeque
// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
func (mfd *masterFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ // ioctl(2) may only be called from a task goroutine.
+ return 0, syserror.ENOTTY
+ }
+
switch cmd := args[1].Uint(); cmd {
case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
// Get the number of bytes in the output queue read buffer.
- return 0, mfd.t.ld.outputQueueReadSize(ctx, io, args)
+ return 0, mfd.t.ld.outputQueueReadSize(t, io, args)
case linux.TCGETS:
// N.B. TCGETS on the master actually returns the configuration
- // of the slave end.
- return mfd.t.ld.getTermios(ctx, io, args)
+ // of the replica end.
+ return mfd.t.ld.getTermios(t, args)
case linux.TCSETS:
// N.B. TCSETS on the master actually affects the configuration
- // of the slave end.
- return mfd.t.ld.setTermios(ctx, io, args)
+ // of the replica end.
+ return mfd.t.ld.setTermios(t, args)
case linux.TCSETSW:
// TODO(b/29356795): This should drain the output queue first.
- return mfd.t.ld.setTermios(ctx, io, args)
+ return mfd.t.ld.setTermios(t, args)
case linux.TIOCGPTN:
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mfd.t.n), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ nP := primitive.Uint32(mfd.t.n)
+ _, err := nP.CopyOut(t, args[2].Pointer())
return 0, err
case linux.TIOCSPTLCK:
// TODO(b/29356795): Implement pty locking. For now just pretend we do.
return 0, nil
case linux.TIOCGWINSZ:
- return 0, mfd.t.ld.windowSize(ctx, io, args)
+ return 0, mfd.t.ld.windowSize(t, args)
case linux.TIOCSWINSZ:
- return 0, mfd.t.ld.setWindowSize(ctx, io, args)
+ return 0, mfd.t.ld.setWindowSize(t, args)
case linux.TIOCSCTTY:
// Make the given terminal the controlling terminal of the
// calling process.
- return 0, mfd.t.setControllingTTY(ctx, io, args, true /* isMaster */)
+ return 0, mfd.t.setControllingTTY(ctx, args, true /* isMaster */)
case linux.TIOCNOTTY:
// Release this process's controlling terminal.
- return 0, mfd.t.releaseControllingTTY(ctx, io, args, true /* isMaster */)
+ return 0, mfd.t.releaseControllingTTY(ctx, args, true /* isMaster */)
case linux.TIOCGPGRP:
// Get the foreground process group.
- return mfd.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */)
+ return mfd.t.foregroundProcessGroup(ctx, args, true /* isMaster */)
case linux.TIOCSPGRP:
// Set the foreground process group.
- return mfd.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */)
+ return mfd.t.setForegroundProcessGroup(ctx, args, true /* isMaster */)
default:
maybeEmitUnimplementedEvent(ctx, cmd)
return 0, syserror.ENOTTY
diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go
index 331c13997..55bff3e60 100644
--- a/pkg/sentry/fsimpl/devpts/queue.go
+++ b/pkg/sentry/fsimpl/devpts/queue.go
@@ -17,8 +17,10 @@ package devpts
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -30,7 +32,7 @@ import (
const waitBufMaxBytes = 131072
// queue represents one of the input or output queues between a pty master and
-// slave. Bytes written to a queue are added to the read buffer until it is
+// replica. Bytes written to a queue are added to the read buffer until it is
// full, at which point they are written to the wait buffer. Bytes are
// processed (i.e. undergo termios transformations) as they are added to the
// read buffer. The read buffer is readable when its length is nonzero and
@@ -83,17 +85,15 @@ func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask {
}
// readableSize writes the number of readable bytes to userspace.
-func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (q *queue) readableSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error {
q.mu.Lock()
defer q.mu.Unlock()
- var size int32
+ size := primitive.Int32(0)
if q.readable {
- size = int32(len(q.readBuf))
+ size = primitive.Int32(len(q.readBuf))
}
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := size.CopyOut(t, args[2].Pointer())
return err
}
diff --git a/pkg/sentry/fsimpl/devpts/replica.go b/pkg/sentry/fsimpl/devpts/replica.go
new file mode 100644
index 000000000..6515c5536
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/replica.go
@@ -0,0 +1,204 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devpts
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// replicaInode is the inode for the replica end of the Terminal.
+//
+// +stateify savable
+type replicaInode struct {
+ implStatFS
+ kernfs.InodeAttrs
+ kernfs.InodeNoopRefCount
+ kernfs.InodeNotDirectory
+ kernfs.InodeNotSymlink
+
+ locks vfs.FileLocks
+
+ // Keep a reference to this inode's dentry.
+ dentry kernfs.Dentry
+
+ // root is the devpts root inode.
+ root *rootInode
+
+ // t is the connected Terminal.
+ t *Terminal
+}
+
+var _ kernfs.Inode = (*replicaInode)(nil)
+
+// Open implements kernfs.Inode.Open.
+func (ri *replicaInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd := &replicaFileDescription{
+ inode: ri,
+ }
+ fd.LockFD.Init(&ri.locks)
+ if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+
+}
+
+// Valid implements kernfs.Inode.Valid.
+func (ri *replicaInode) Valid(context.Context) bool {
+ // Return valid if the replica still exists.
+ ri.root.mu.Lock()
+ defer ri.root.mu.Unlock()
+ _, ok := ri.root.replicas[ri.t.n]
+ return ok
+}
+
+// Stat implements kernfs.Inode.Stat.
+func (ri *replicaInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+ statx, err := ri.InodeAttrs.Stat(ctx, vfsfs, opts)
+ if err != nil {
+ return linux.Statx{}, err
+ }
+ statx.Blksize = 1024
+ statx.RdevMajor = linux.UNIX98_PTY_REPLICA_MAJOR
+ statx.RdevMinor = ri.t.n
+ return statx, nil
+}
+
+// SetStat implements kernfs.Inode.SetStat
+func (ri *replicaInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+ if opts.Stat.Mask&linux.STATX_SIZE != 0 {
+ return syserror.EINVAL
+ }
+ return ri.InodeAttrs.SetStat(ctx, vfsfs, creds, opts)
+}
+
+// +stateify savable
+type replicaFileDescription struct {
+ vfsfd vfs.FileDescription
+ vfs.FileDescriptionDefaultImpl
+ vfs.LockFD
+
+ inode *replicaInode
+}
+
+var _ vfs.FileDescriptionImpl = (*replicaFileDescription)(nil)
+
+// Release implements fs.FileOperations.Release.
+func (rfd *replicaFileDescription) Release(ctx context.Context) {}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (rfd *replicaFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ rfd.inode.t.ld.replicaWaiter.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (rfd *replicaFileDescription) EventUnregister(e *waiter.Entry) {
+ rfd.inode.t.ld.replicaWaiter.EventUnregister(e)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (rfd *replicaFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return rfd.inode.t.ld.replicaReadiness()
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (rfd *replicaFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
+ return rfd.inode.t.ld.inputQueueRead(ctx, dst)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (rfd *replicaFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
+ return rfd.inode.t.ld.outputQueueWrite(ctx, src)
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (rfd *replicaFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ // ioctl(2) may only be called from a task goroutine.
+ return 0, syserror.ENOTTY
+ }
+
+ switch cmd := args[1].Uint(); cmd {
+ case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
+ // Get the number of bytes in the input queue read buffer.
+ return 0, rfd.inode.t.ld.inputQueueReadSize(t, io, args)
+ case linux.TCGETS:
+ return rfd.inode.t.ld.getTermios(t, args)
+ case linux.TCSETS:
+ return rfd.inode.t.ld.setTermios(t, args)
+ case linux.TCSETSW:
+ // TODO(b/29356795): This should drain the output queue first.
+ return rfd.inode.t.ld.setTermios(t, args)
+ case linux.TIOCGPTN:
+ nP := primitive.Uint32(rfd.inode.t.n)
+ _, err := nP.CopyOut(t, args[2].Pointer())
+ return 0, err
+ case linux.TIOCGWINSZ:
+ return 0, rfd.inode.t.ld.windowSize(t, args)
+ case linux.TIOCSWINSZ:
+ return 0, rfd.inode.t.ld.setWindowSize(t, args)
+ case linux.TIOCSCTTY:
+ // Make the given terminal the controlling terminal of the
+ // calling process.
+ return 0, rfd.inode.t.setControllingTTY(ctx, args, false /* isMaster */)
+ case linux.TIOCNOTTY:
+ // Release this process's controlling terminal.
+ return 0, rfd.inode.t.releaseControllingTTY(ctx, args, false /* isMaster */)
+ case linux.TIOCGPGRP:
+ // Get the foreground process group.
+ return rfd.inode.t.foregroundProcessGroup(ctx, args, false /* isMaster */)
+ case linux.TIOCSPGRP:
+ // Set the foreground process group.
+ return rfd.inode.t.setForegroundProcessGroup(ctx, args, false /* isMaster */)
+ default:
+ maybeEmitUnimplementedEvent(ctx, cmd)
+ return 0, syserror.ENOTTY
+ }
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (rfd *replicaFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+ creds := auth.CredentialsFromContext(ctx)
+ fs := rfd.vfsfd.VirtualDentry().Mount().Filesystem()
+ return rfd.inode.SetStat(ctx, fs, creds, opts)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (rfd *replicaFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+ fs := rfd.vfsfd.VirtualDentry().Mount().Filesystem()
+ return rfd.inode.Stat(ctx, fs, opts)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (rfd *replicaFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return rfd.Locks().LockPOSIX(ctx, &rfd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (rfd *replicaFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return rfd.Locks().UnlockPOSIX(ctx, &rfd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go
deleted file mode 100644
index a9da7af64..000000000
--- a/pkg/sentry/fsimpl/devpts/slave.go
+++ /dev/null
@@ -1,198 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package devpts
-
-import (
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/sentry/arch"
- fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
- "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
- "gvisor.dev/gvisor/pkg/waiter"
-)
-
-// slaveInode is the inode for the slave end of the Terminal.
-type slaveInode struct {
- implStatFS
- kernfs.InodeAttrs
- kernfs.InodeNoopRefCount
- kernfs.InodeNotDirectory
- kernfs.InodeNotSymlink
-
- locks vfs.FileLocks
-
- // Keep a reference to this inode's dentry.
- dentry kernfs.Dentry
-
- // root is the devpts root inode.
- root *rootInode
-
- // t is the connected Terminal.
- t *Terminal
-}
-
-var _ kernfs.Inode = (*slaveInode)(nil)
-
-// Open implements kernfs.Inode.Open.
-func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- si.IncRef()
- fd := &slaveFileDescription{
- inode: si,
- }
- fd.LockFD.Init(&si.locks)
- if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
- si.DecRef(ctx)
- return nil, err
- }
- return &fd.vfsfd, nil
-
-}
-
-// Valid implements kernfs.Inode.Valid.
-func (si *slaveInode) Valid(context.Context) bool {
- // Return valid if the slave still exists.
- si.root.mu.Lock()
- defer si.root.mu.Unlock()
- _, ok := si.root.slaves[si.t.n]
- return ok
-}
-
-// Stat implements kernfs.Inode.Stat.
-func (si *slaveInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
- statx, err := si.InodeAttrs.Stat(ctx, vfsfs, opts)
- if err != nil {
- return linux.Statx{}, err
- }
- statx.Blksize = 1024
- statx.RdevMajor = linux.UNIX98_PTY_SLAVE_MAJOR
- statx.RdevMinor = si.t.n
- return statx, nil
-}
-
-// SetStat implements kernfs.Inode.SetStat
-func (si *slaveInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
- if opts.Stat.Mask&linux.STATX_SIZE != 0 {
- return syserror.EINVAL
- }
- return si.InodeAttrs.SetStat(ctx, vfsfs, creds, opts)
-}
-
-type slaveFileDescription struct {
- vfsfd vfs.FileDescription
- vfs.FileDescriptionDefaultImpl
- vfs.LockFD
-
- inode *slaveInode
-}
-
-var _ vfs.FileDescriptionImpl = (*slaveFileDescription)(nil)
-
-// Release implements fs.FileOperations.Release.
-func (sfd *slaveFileDescription) Release(ctx context.Context) {
- sfd.inode.DecRef(ctx)
-}
-
-// EventRegister implements waiter.Waitable.EventRegister.
-func (sfd *slaveFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
- sfd.inode.t.ld.slaveWaiter.EventRegister(e, mask)
-}
-
-// EventUnregister implements waiter.Waitable.EventUnregister.
-func (sfd *slaveFileDescription) EventUnregister(e *waiter.Entry) {
- sfd.inode.t.ld.slaveWaiter.EventUnregister(e)
-}
-
-// Readiness implements waiter.Waitable.Readiness.
-func (sfd *slaveFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
- return sfd.inode.t.ld.slaveReadiness()
-}
-
-// Read implements vfs.FileDescriptionImpl.Read.
-func (sfd *slaveFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
- return sfd.inode.t.ld.inputQueueRead(ctx, dst)
-}
-
-// Write implements vfs.FileDescriptionImpl.Write.
-func (sfd *slaveFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
- return sfd.inode.t.ld.outputQueueWrite(ctx, src)
-}
-
-// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
-func (sfd *slaveFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
- switch cmd := args[1].Uint(); cmd {
- case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
- // Get the number of bytes in the input queue read buffer.
- return 0, sfd.inode.t.ld.inputQueueReadSize(ctx, io, args)
- case linux.TCGETS:
- return sfd.inode.t.ld.getTermios(ctx, io, args)
- case linux.TCSETS:
- return sfd.inode.t.ld.setTermios(ctx, io, args)
- case linux.TCSETSW:
- // TODO(b/29356795): This should drain the output queue first.
- return sfd.inode.t.ld.setTermios(ctx, io, args)
- case linux.TIOCGPTN:
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sfd.inode.t.n), usermem.IOOpts{
- AddressSpaceActive: true,
- })
- return 0, err
- case linux.TIOCGWINSZ:
- return 0, sfd.inode.t.ld.windowSize(ctx, io, args)
- case linux.TIOCSWINSZ:
- return 0, sfd.inode.t.ld.setWindowSize(ctx, io, args)
- case linux.TIOCSCTTY:
- // Make the given terminal the controlling terminal of the
- // calling process.
- return 0, sfd.inode.t.setControllingTTY(ctx, io, args, false /* isMaster */)
- case linux.TIOCNOTTY:
- // Release this process's controlling terminal.
- return 0, sfd.inode.t.releaseControllingTTY(ctx, io, args, false /* isMaster */)
- case linux.TIOCGPGRP:
- // Get the foreground process group.
- return sfd.inode.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */)
- case linux.TIOCSPGRP:
- // Set the foreground process group.
- return sfd.inode.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */)
- default:
- maybeEmitUnimplementedEvent(ctx, cmd)
- return 0, syserror.ENOTTY
- }
-}
-
-// SetStat implements vfs.FileDescriptionImpl.SetStat.
-func (sfd *slaveFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
- creds := auth.CredentialsFromContext(ctx)
- fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
- return sfd.inode.SetStat(ctx, fs, creds, opts)
-}
-
-// Stat implements vfs.FileDescriptionImpl.Stat.
-func (sfd *slaveFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
- fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
- return sfd.inode.Stat(ctx, fs, opts)
-}
-
-// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
-func (sfd *slaveFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
- return sfd.Locks().LockPOSIX(ctx, &sfd.vfsfd, uid, t, start, length, whence, block)
-}
-
-// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
-func (sfd *slaveFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
- return sfd.Locks().UnlockPOSIX(ctx, &sfd.vfsfd, uid, start, length, whence)
-}
diff --git a/pkg/sentry/fsimpl/devpts/terminal.go b/pkg/sentry/fsimpl/devpts/terminal.go
index 7d2781c54..510bd6d89 100644
--- a/pkg/sentry/fsimpl/devpts/terminal.go
+++ b/pkg/sentry/fsimpl/devpts/terminal.go
@@ -17,9 +17,9 @@ package devpts
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/usermem"
)
// Terminal is a pseudoterminal.
@@ -36,25 +36,25 @@ type Terminal struct {
// this terminal. This field is immutable.
masterKTTY *kernel.TTY
- // slaveKTTY contains the controlling process of the slave end of this
+ // replicaKTTY contains the controlling process of the replica end of this
// terminal. This field is immutable.
- slaveKTTY *kernel.TTY
+ replicaKTTY *kernel.TTY
}
func newTerminal(n uint32) *Terminal {
- termios := linux.DefaultSlaveTermios
+ termios := linux.DefaultReplicaTermios
t := Terminal{
- n: n,
- ld: newLineDiscipline(termios),
- masterKTTY: &kernel.TTY{Index: n},
- slaveKTTY: &kernel.TTY{Index: n},
+ n: n,
+ ld: newLineDiscipline(termios),
+ masterKTTY: &kernel.TTY{Index: n},
+ replicaKTTY: &kernel.TTY{Index: n},
}
return &t
}
// setControllingTTY makes tm the controlling terminal of the calling thread
// group.
-func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+func (tm *Terminal) setControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error {
task := kernel.TaskFromContext(ctx)
if task == nil {
panic("setControllingTTY must be called from a task context")
@@ -65,7 +65,7 @@ func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args a
// releaseControllingTTY removes tm as the controlling terminal of the calling
// thread group.
-func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+func (tm *Terminal) releaseControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error {
task := kernel.TaskFromContext(ctx)
if task == nil {
panic("releaseControllingTTY must be called from a task context")
@@ -75,7 +75,7 @@ func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, ar
}
// foregroundProcessGroup gets the process group ID of tm's foreground process.
-func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+func (tm *Terminal) foregroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
task := kernel.TaskFromContext(ctx)
if task == nil {
panic("foregroundProcessGroup must be called from a task context")
@@ -87,24 +87,21 @@ func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, a
}
// Write it out to *arg.
- _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ retP := primitive.Int32(ret)
+ _, err = retP.CopyOut(task, args[2].Pointer())
return 0, err
}
// foregroundProcessGroup sets tm's foreground process.
-func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
task := kernel.TaskFromContext(ctx)
if task == nil {
panic("setForegroundProcessGroup must be called from a task context")
}
// Read in the process group ID.
- var pgid int32
- if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ var pgid primitive.Int32
+ if _, err := pgid.CopyIn(task, args[2].Pointer()); err != nil {
return 0, err
}
@@ -116,5 +113,5 @@ func (tm *Terminal) tty(isMaster bool) *kernel.TTY {
if isMaster {
return tm.masterKTTY
}
- return tm.slaveKTTY
+ return tm.replicaKTTY
}
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
index 52f44f66d..6d1753080 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
@@ -33,8 +33,10 @@ import (
const Name = "devtmpfs"
// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type FilesystemType struct {
- initOnce sync.Once
+ initOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1664): not yet supported.
initErr error
// fs is the tmpfs filesystem that backs all mounts of this FilesystemType.
@@ -80,7 +82,7 @@ type Accessor struct {
// NewAccessor returns an Accessor that supports creation of device special
// files in the devtmpfs instance registered with name fsTypeName in vfsObj.
func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, fsTypeName string) (*Accessor, error) {
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.GetFilesystemOptions{})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.MountOptions{})
if err != nil {
return nil, err
}
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
index 827a608cb..3a38b8bb4 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
@@ -48,7 +48,7 @@ func setupDevtmpfs(t *testing.T) (context.Context, *auth.Credentials, *vfs.Virtu
})
// Create a test mount namespace with devtmpfs mounted at "/dev".
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.GetFilesystemOptions{})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.MountOptions{})
if err != nil {
t.Fatalf("failed to create tmpfs root mount: %v", err)
}
diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go
index 812171fa3..1c27ad700 100644
--- a/pkg/sentry/fsimpl/eventfd/eventfd.go
+++ b/pkg/sentry/fsimpl/eventfd/eventfd.go
@@ -30,9 +30,11 @@ import (
"gvisor.dev/gvisor/pkg/waiter"
)
-// EventFileDescription implements FileDescriptionImpl for file-based event
+// EventFileDescription implements vfs.FileDescriptionImpl for file-based event
// notification (eventfd). Eventfds are usually internal to the Sentry but in
// certain situations they may be converted into a host-backed eventfd.
+//
+// +stateify savable
type EventFileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
@@ -106,7 +108,7 @@ func (efd *EventFileDescription) HostFD() (int, error) {
return efd.hostfd, nil
}
-// Release implements FileDescriptionImpl.Release()
+// Release implements vfs.FileDescriptionImpl.Release.
func (efd *EventFileDescription) Release(context.Context) {
efd.mu.Lock()
defer efd.mu.Unlock()
@@ -119,7 +121,7 @@ func (efd *EventFileDescription) Release(context.Context) {
}
}
-// Read implements FileDescriptionImpl.Read.
+// Read implements vfs.FileDescriptionImpl.Read.
func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
if dst.NumBytes() < 8 {
return 0, syscall.EINVAL
@@ -130,7 +132,7 @@ func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequenc
return 8, nil
}
-// Write implements FileDescriptionImpl.Write.
+// Write implements vfs.FileDescriptionImpl.Write.
func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
if src.NumBytes() < 8 {
return 0, syscall.EINVAL
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index abc610ef3..7b1eec3da 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -51,6 +51,8 @@ go_library(
"//pkg/fd",
"//pkg/fspath",
"//pkg/log",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
"//pkg/safemem",
"//pkg/sentry/arch",
"//pkg/sentry/fs",
@@ -86,9 +88,9 @@ go_test(
library = ":ext",
deps = [
"//pkg/abi/linux",
- "//pkg/binary",
"//pkg/context",
"//pkg/fspath",
+ "//pkg/marshal/primitive",
"//pkg/sentry/contexttest",
"//pkg/sentry/fsimpl/ext/disklayout",
"//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index a2cc9b59f..c349b886e 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -59,7 +59,11 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys
vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.MountOptions{
+ GetFilesystemOptions: vfs.GetFilesystemOptions{
+ InternalData: int(f.Fd()),
+ },
+ })
if err != nil {
f.Close()
return nil, nil, nil, nil, err
diff --git a/pkg/sentry/fsimpl/ext/block_map_file.go b/pkg/sentry/fsimpl/ext/block_map_file.go
index 8bb104ff0..1165234f9 100644
--- a/pkg/sentry/fsimpl/ext/block_map_file.go
+++ b/pkg/sentry/fsimpl/ext/block_map_file.go
@@ -18,7 +18,7 @@ import (
"io"
"math"
- "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -34,19 +34,19 @@ type blockMapFile struct {
// directBlks are the direct blocks numbers. The physical blocks pointed by
// these holds file data. Contains file blocks 0 to 11.
- directBlks [numDirectBlks]uint32
+ directBlks [numDirectBlks]primitive.Uint32
// indirectBlk is the physical block which contains (blkSize/4) direct block
// numbers (as uint32 integers).
- indirectBlk uint32
+ indirectBlk primitive.Uint32
// doubleIndirectBlk is the physical block which contains (blkSize/4) indirect
// block numbers (as uint32 integers).
- doubleIndirectBlk uint32
+ doubleIndirectBlk primitive.Uint32
// tripleIndirectBlk is the physical block which contains (blkSize/4) doubly
// indirect block numbers (as uint32 integers).
- tripleIndirectBlk uint32
+ tripleIndirectBlk primitive.Uint32
// coverage at (i)th index indicates the amount of file data a node at
// height (i) covers. Height 0 is the direct block.
@@ -68,10 +68,12 @@ func newBlockMapFile(args inodeArgs) (*blockMapFile, error) {
}
blkMap := file.regFile.inode.diskInode.Data()
- binary.Unmarshal(blkMap[:numDirectBlks*4], binary.LittleEndian, &file.directBlks)
- binary.Unmarshal(blkMap[numDirectBlks*4:(numDirectBlks+1)*4], binary.LittleEndian, &file.indirectBlk)
- binary.Unmarshal(blkMap[(numDirectBlks+1)*4:(numDirectBlks+2)*4], binary.LittleEndian, &file.doubleIndirectBlk)
- binary.Unmarshal(blkMap[(numDirectBlks+2)*4:(numDirectBlks+3)*4], binary.LittleEndian, &file.tripleIndirectBlk)
+ for i := 0; i < numDirectBlks; i++ {
+ file.directBlks[i].UnmarshalBytes(blkMap[i*4 : (i+1)*4])
+ }
+ file.indirectBlk.UnmarshalBytes(blkMap[numDirectBlks*4 : (numDirectBlks+1)*4])
+ file.doubleIndirectBlk.UnmarshalBytes(blkMap[(numDirectBlks+1)*4 : (numDirectBlks+2)*4])
+ file.tripleIndirectBlk.UnmarshalBytes(blkMap[(numDirectBlks+2)*4 : (numDirectBlks+3)*4])
return file, nil
}
@@ -117,16 +119,16 @@ func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) {
switch {
case offset < dirBlksEnd:
// Direct block.
- curR, err = f.read(f.directBlks[offset/f.regFile.inode.blkSize], offset%f.regFile.inode.blkSize, 0, dst[read:])
+ curR, err = f.read(uint32(f.directBlks[offset/f.regFile.inode.blkSize]), offset%f.regFile.inode.blkSize, 0, dst[read:])
case offset < indirBlkEnd:
// Indirect block.
- curR, err = f.read(f.indirectBlk, offset-dirBlksEnd, 1, dst[read:])
+ curR, err = f.read(uint32(f.indirectBlk), offset-dirBlksEnd, 1, dst[read:])
case offset < doubIndirBlkEnd:
// Doubly indirect block.
- curR, err = f.read(f.doubleIndirectBlk, offset-indirBlkEnd, 2, dst[read:])
+ curR, err = f.read(uint32(f.doubleIndirectBlk), offset-indirBlkEnd, 2, dst[read:])
default:
// Triply indirect block.
- curR, err = f.read(f.tripleIndirectBlk, offset-doubIndirBlkEnd, 3, dst[read:])
+ curR, err = f.read(uint32(f.tripleIndirectBlk), offset-doubIndirBlkEnd, 3, dst[read:])
}
read += curR
@@ -174,13 +176,13 @@ func (f *blockMapFile) read(curPhyBlk uint32, relFileOff uint64, height uint, ds
read := 0
curChildOff := relFileOff % childCov
for i := startIdx; i < endIdx; i++ {
- var childPhyBlk uint32
+ var childPhyBlk primitive.Uint32
err := readFromDisk(f.regFile.inode.fs.dev, curPhyBlkOff+int64(i*4), &childPhyBlk)
if err != nil {
return read, err
}
- n, err := f.read(childPhyBlk, curChildOff, height-1, dst[read:])
+ n, err := f.read(uint32(childPhyBlk), curChildOff, height-1, dst[read:])
read += n
if err != nil {
return read, err
diff --git a/pkg/sentry/fsimpl/ext/block_map_test.go b/pkg/sentry/fsimpl/ext/block_map_test.go
index 6fa84e7aa..ed98b482e 100644
--- a/pkg/sentry/fsimpl/ext/block_map_test.go
+++ b/pkg/sentry/fsimpl/ext/block_map_test.go
@@ -20,7 +20,7 @@ import (
"testing"
"github.com/google/go-cmp/cmp"
- "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
)
@@ -87,29 +87,33 @@ func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) {
mockDisk := make([]byte, mockBMDiskSize)
var fileData []byte
blkNums := newBlkNumGen()
- var data []byte
+ off := 0
+ data := make([]byte, (numDirectBlks+3)*(*primitive.Uint32)(nil).SizeBytes())
// Write the direct blocks.
for i := 0; i < numDirectBlks; i++ {
- curBlkNum := blkNums.next()
- data = binary.Marshal(data, binary.LittleEndian, curBlkNum)
- fileData = append(fileData, writeFileDataToBlock(mockDisk, curBlkNum, 0, blkNums)...)
+ curBlkNum := primitive.Uint32(blkNums.next())
+ curBlkNum.MarshalBytes(data[off:])
+ off += curBlkNum.SizeBytes()
+ fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(curBlkNum), 0, blkNums)...)
}
// Write to indirect block.
- indirectBlk := blkNums.next()
- data = binary.Marshal(data, binary.LittleEndian, indirectBlk)
- fileData = append(fileData, writeFileDataToBlock(mockDisk, indirectBlk, 1, blkNums)...)
-
- // Write to indirect block.
- doublyIndirectBlk := blkNums.next()
- data = binary.Marshal(data, binary.LittleEndian, doublyIndirectBlk)
- fileData = append(fileData, writeFileDataToBlock(mockDisk, doublyIndirectBlk, 2, blkNums)...)
-
- // Write to indirect block.
- triplyIndirectBlk := blkNums.next()
- data = binary.Marshal(data, binary.LittleEndian, triplyIndirectBlk)
- fileData = append(fileData, writeFileDataToBlock(mockDisk, triplyIndirectBlk, 3, blkNums)...)
+ indirectBlk := primitive.Uint32(blkNums.next())
+ indirectBlk.MarshalBytes(data[off:])
+ off += indirectBlk.SizeBytes()
+ fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(indirectBlk), 1, blkNums)...)
+
+ // Write to double indirect block.
+ doublyIndirectBlk := primitive.Uint32(blkNums.next())
+ doublyIndirectBlk.MarshalBytes(data[off:])
+ off += doublyIndirectBlk.SizeBytes()
+ fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(doublyIndirectBlk), 2, blkNums)...)
+
+ // Write to triple indirect block.
+ triplyIndirectBlk := primitive.Uint32(blkNums.next())
+ triplyIndirectBlk.MarshalBytes(data[off:])
+ fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(triplyIndirectBlk), 3, blkNums)...)
args := inodeArgs{
fs: &filesystem{
@@ -142,9 +146,9 @@ func writeFileDataToBlock(disk []byte, blkNum uint32, height uint, blkNums *blkN
var fileData []byte
for off := blkNum * mockBMBlkSize; off < (blkNum+1)*mockBMBlkSize; off += 4 {
- curBlkNum := blkNums.next()
- copy(disk[off:off+4], binary.Marshal(nil, binary.LittleEndian, curBlkNum))
- fileData = append(fileData, writeFileDataToBlock(disk, curBlkNum, height-1, blkNums)...)
+ curBlkNum := primitive.Uint32(blkNums.next())
+ curBlkNum.MarshalBytes(disk[off : off+4])
+ fileData = append(fileData, writeFileDataToBlock(disk, uint32(curBlkNum), height-1, blkNums)...)
}
return fileData
}
diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go
index 7a1b4219f..9bfed883a 100644
--- a/pkg/sentry/fsimpl/ext/dentry.go
+++ b/pkg/sentry/fsimpl/ext/dentry.go
@@ -20,6 +20,8 @@ import (
)
// dentry implements vfs.DentryImpl.
+//
+// +stateify savable
type dentry struct {
vfsd vfs.Dentry
diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go
index 0fc01668d..0ad79b381 100644
--- a/pkg/sentry/fsimpl/ext/directory.go
+++ b/pkg/sentry/fsimpl/ext/directory.go
@@ -16,7 +16,6 @@ package ext
import (
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -28,6 +27,8 @@ import (
)
// directory represents a directory inode. It holds the childList in memory.
+//
+// +stateify savable
type directory struct {
inode inode
@@ -39,7 +40,7 @@ type directory struct {
// Lock Order (outermost locks must be taken first):
// directory.mu
// filesystem.mu
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
// childList is a list containing (1) child dirents and (2) fake dirents
// (with diskDirent == nil) that represent the iteration position of
@@ -98,7 +99,7 @@ func newDirectory(args inodeArgs, newDirent bool) (*directory, error) {
} else {
curDirent.diskDirent = &disklayout.DirentOld{}
}
- binary.Unmarshal(buf, binary.LittleEndian, curDirent.diskDirent)
+ curDirent.diskDirent.UnmarshalBytes(buf)
if curDirent.diskDirent.Inode() != 0 && len(curDirent.diskDirent.FileName()) != 0 {
// Inode number and name length fields being set to 0 is used to indicate
@@ -120,6 +121,8 @@ func (i *inode) isDir() bool {
}
// dirent is the directory.childList node.
+//
+// +stateify savable
type dirent struct {
diskDirent disklayout.Dirent
@@ -129,6 +132,8 @@ type dirent struct {
// directoryFD represents a directory file description. It implements
// vfs.FileDescriptionImpl.
+//
+// +stateify savable
type directoryFD struct {
fileDescription
vfs.DirectoryFileDescriptionDefaultImpl
diff --git a/pkg/sentry/fsimpl/ext/disklayout/BUILD b/pkg/sentry/fsimpl/ext/disklayout/BUILD
index 9bd9c76c0..d98a05dd8 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/BUILD
+++ b/pkg/sentry/fsimpl/ext/disklayout/BUILD
@@ -22,10 +22,11 @@ go_library(
"superblock_old.go",
"test_utils.go",
],
+ marshal = True,
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
- "//pkg/binary",
+ "//pkg/marshal",
"//pkg/sentry/fs",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/kernel/time",
diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group.go b/pkg/sentry/fsimpl/ext/disklayout/block_group.go
index ad6f4fef8..0d56ae9da 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/block_group.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/block_group.go
@@ -14,6 +14,10 @@
package disklayout
+import (
+ "gvisor.dev/gvisor/pkg/marshal"
+)
+
// BlockGroup represents a Linux ext block group descriptor. An ext file system
// is split into a series of block groups. This provides an access layer to
// information needed to access and use a block group.
@@ -30,6 +34,8 @@ package disklayout
//
// See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#block-group-descriptors.
type BlockGroup interface {
+ marshal.Marshallable
+
// InodeTable returns the absolute block number of the block containing the
// inode table. This points to an array of Inode structs. Inode tables are
// statically allocated at mkfs time. The superblock records the number of
diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go
index 3e16c76db..a35fa22a0 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go
@@ -17,6 +17,8 @@ package disklayout
// BlockGroup32Bit emulates the first half of struct ext4_group_desc in
// fs/ext4/ext4.h. It is the block group descriptor struct for ext2, ext3 and
// 32-bit ext4 filesystems. It implements BlockGroup interface.
+//
+// +marshal
type BlockGroup32Bit struct {
BlockBitmapLo uint32
InodeBitmapLo uint32
diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go
index 9a809197a..d54d1d345 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go
@@ -18,6 +18,8 @@ package disklayout
// It is the block group descriptor struct for 64-bit ext4 filesystems.
// It implements BlockGroup interface. It is an extension of the 32-bit
// version of BlockGroup.
+//
+// +marshal
type BlockGroup64Bit struct {
// We embed the 32-bit struct here because 64-bit version is just an extension
// of the 32-bit version.
diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go
index 0ef4294c0..e4ce484e4 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go
@@ -21,6 +21,8 @@ import (
// TestBlockGroupSize tests that the block group descriptor structs are of the
// correct size.
func TestBlockGroupSize(t *testing.T) {
- assertSize(t, BlockGroup32Bit{}, 32)
- assertSize(t, BlockGroup64Bit{}, 64)
+ var bgSmall BlockGroup32Bit
+ assertSize(t, &bgSmall, 32)
+ var bgBig BlockGroup64Bit
+ assertSize(t, &bgBig, 64)
}
diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent.go b/pkg/sentry/fsimpl/ext/disklayout/dirent.go
index 417b6cf65..568c8cb4c 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/dirent.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/dirent.go
@@ -15,6 +15,7 @@
package disklayout
import (
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/sentry/fs"
)
@@ -51,6 +52,8 @@ var (
//
// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#linear-classic-directories.
type Dirent interface {
+ marshal.Marshallable
+
// Inode returns the absolute inode number of the underlying inode.
// Inode number 0 signifies an unused dirent.
Inode() uint32
diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go
index 29ae4a5c2..51f9c2946 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go
@@ -29,12 +29,14 @@ import (
// Note: This struct can be of variable size on disk. The one described below
// is of maximum size and the FileName beyond NameLength bytes might contain
// garbage.
+//
+// +marshal
type DirentNew struct {
InodeNumber uint32
RecordLength uint16
NameLength uint8
FileTypeRaw uint8
- FileNameRaw [MaxFileName]byte
+ FileNameRaw [MaxFileName]byte `marshal:"unaligned"`
}
// Compiles only if DirentNew implements Dirent.
diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go
index 6fff12a6e..d4b19e086 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go
@@ -22,11 +22,13 @@ import "gvisor.dev/gvisor/pkg/sentry/fs"
// Note: This struct can be of variable size on disk. The one described below
// is of maximum size and the FileName beyond NameLength bytes might contain
// garbage.
+//
+// +marshal
type DirentOld struct {
InodeNumber uint32
RecordLength uint16
NameLength uint16
- FileNameRaw [MaxFileName]byte
+ FileNameRaw [MaxFileName]byte `marshal:"unaligned"`
}
// Compiles only if DirentOld implements Dirent.
diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go
index 934919f8a..3486864dc 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go
@@ -21,6 +21,8 @@ import (
// TestDirentSize tests that the dirent structs are of the correct
// size.
func TestDirentSize(t *testing.T) {
- assertSize(t, DirentOld{}, uintptr(DirentSize))
- assertSize(t, DirentNew{}, uintptr(DirentSize))
+ var dOld DirentOld
+ assertSize(t, &dOld, DirentSize)
+ var dNew DirentNew
+ assertSize(t, &dNew, DirentSize)
}
diff --git a/pkg/sentry/fsimpl/ext/disklayout/disklayout.go b/pkg/sentry/fsimpl/ext/disklayout/disklayout.go
index bdf4e2132..0834e9ba8 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/disklayout.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/disklayout.go
@@ -36,8 +36,6 @@
// escape analysis on an unknown implementation at compile time.
//
// Notes:
-// - All fields in these structs are exported because binary.Read would
-// panic otherwise.
// - All structures on disk are in little-endian order. Only jbd2 (journal)
// structures are in big-endian order.
// - All OS dependent fields in these structures will be interpretted using
diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent.go b/pkg/sentry/fsimpl/ext/disklayout/extent.go
index 4110649ab..b13999bfc 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/extent.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/extent.go
@@ -14,6 +14,10 @@
package disklayout
+import (
+ "gvisor.dev/gvisor/pkg/marshal"
+)
+
// Extents were introduced in ext4 and provide huge performance gains in terms
// data locality and reduced metadata block usage. Extents are organized in
// extent trees. The root node is contained in inode.BlocksRaw.
@@ -64,6 +68,8 @@ type ExtentNode struct {
// ExtentEntry represents an extent tree node entry. The entry can either be
// an ExtentIdx or Extent itself. This exists to simplify navigation logic.
type ExtentEntry interface {
+ marshal.Marshallable
+
// FileBlock returns the first file block number covered by this entry.
FileBlock() uint32
@@ -75,6 +81,8 @@ type ExtentEntry interface {
// tree node begins with this and is followed by `NumEntries` number of:
// - Extent if `Depth` == 0
// - ExtentIdx otherwise
+//
+// +marshal
type ExtentHeader struct {
// Magic in the extent magic number, must be 0xf30a.
Magic uint16
@@ -96,6 +104,8 @@ type ExtentHeader struct {
// internal nodes. Sorted in ascending order based on FirstFileBlock since
// Linux does a binary search on this. This points to a block containing the
// child node.
+//
+// +marshal
type ExtentIdx struct {
FirstFileBlock uint32
ChildBlockLo uint32
@@ -121,6 +131,8 @@ func (ei *ExtentIdx) PhysicalBlock() uint64 {
// nodes. Sorted in ascending order based on FirstFileBlock since Linux does a
// binary search on this. This points to an array of data blocks containing the
// file data. It covers `Length` data blocks starting from `StartBlock`.
+//
+// +marshal
type Extent struct {
FirstFileBlock uint32
Length uint16
diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
index 8762b90db..c96002e19 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
@@ -21,7 +21,10 @@ import (
// TestExtentSize tests that the extent structs are of the correct
// size.
func TestExtentSize(t *testing.T) {
- assertSize(t, ExtentHeader{}, ExtentHeaderSize)
- assertSize(t, ExtentIdx{}, ExtentEntrySize)
- assertSize(t, Extent{}, ExtentEntrySize)
+ var h ExtentHeader
+ assertSize(t, &h, ExtentHeaderSize)
+ var i ExtentIdx
+ assertSize(t, &i, ExtentEntrySize)
+ var e Extent
+ assertSize(t, &e, ExtentEntrySize)
}
diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode.go b/pkg/sentry/fsimpl/ext/disklayout/inode.go
index 88ae913f5..ef25040a9 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/inode.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/inode.go
@@ -16,6 +16,7 @@ package disklayout
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/kernel/time"
)
@@ -38,6 +39,8 @@ const (
//
// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#index-nodes.
type Inode interface {
+ marshal.Marshallable
+
// Mode returns the linux file mode which is majorly used to extract
// information like:
// - File permissions (read/write/execute by user/group/others).
diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_new.go b/pkg/sentry/fsimpl/ext/disklayout/inode_new.go
index 8f9f574ce..a4503f5cf 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/inode_new.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/inode_new.go
@@ -27,6 +27,8 @@ import "gvisor.dev/gvisor/pkg/sentry/kernel/time"
// are used to provide nanoscond precision. Hence, these timestamps will now
// overflow in May 2446.
// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps.
+//
+// +marshal
type InodeNew struct {
InodeOld
diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_old.go b/pkg/sentry/fsimpl/ext/disklayout/inode_old.go
index db25b11b6..e6b28babf 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/inode_old.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/inode_old.go
@@ -30,6 +30,8 @@ const (
//
// All fields representing time are in seconds since the epoch. Which means that
// they will overflow in January 2038.
+//
+// +marshal
type InodeOld struct {
ModeRaw uint16
UIDLo uint16
diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_test.go b/pkg/sentry/fsimpl/ext/disklayout/inode_test.go
index dd03ee50e..90744e956 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/inode_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/inode_test.go
@@ -24,10 +24,12 @@ import (
// TestInodeSize tests that the inode structs are of the correct size.
func TestInodeSize(t *testing.T) {
- assertSize(t, InodeOld{}, OldInodeSize)
+ var iOld InodeOld
+ assertSize(t, &iOld, OldInodeSize)
// This was updated from 156 bytes to 160 bytes in Oct 2015.
- assertSize(t, InodeNew{}, 160)
+ var iNew InodeNew
+ assertSize(t, &iNew, 160)
}
// TestTimestampSeconds tests that the seconds part of [a/c/m] timestamps in
diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock.go b/pkg/sentry/fsimpl/ext/disklayout/superblock.go
index 8bb327006..70948ebe9 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/superblock.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock.go
@@ -14,6 +14,10 @@
package disklayout
+import (
+ "gvisor.dev/gvisor/pkg/marshal"
+)
+
const (
// SbOffset is the absolute offset at which the superblock is placed.
SbOffset = 1024
@@ -38,6 +42,8 @@ const (
//
// See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#super-block.
type SuperBlock interface {
+ marshal.Marshallable
+
// InodesCount returns the total number of inodes in this filesystem.
InodesCount() uint32
diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go
index 53e515fd3..4dc6080fb 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go
@@ -17,6 +17,8 @@ package disklayout
// SuperBlock32Bit implements SuperBlock and represents the 32-bit version of
// the ext4_super_block struct in fs/ext4/ext4.h. Should be used only if
// RevLevel = DynamicRev and 64-bit feature is disabled.
+//
+// +marshal
type SuperBlock32Bit struct {
// We embed the old superblock struct here because the 32-bit version is just
// an extension of the old version.
diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go
index 7c1053fb4..2c9039327 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go
@@ -19,6 +19,8 @@ package disklayout
// 1024 bytes (smallest possible block size) and hence the superblock always
// fits in no more than one data block. Should only be used when the 64-bit
// feature is set.
+//
+// +marshal
type SuperBlock64Bit struct {
// We embed the 32-bit struct here because 64-bit version is just an extension
// of the 32-bit version.
diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go
index 9221e0251..e4709f23c 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go
@@ -16,6 +16,8 @@ package disklayout
// SuperBlockOld implements SuperBlock and represents the old version of the
// superblock struct. Should be used only if RevLevel = OldRev.
+//
+// +marshal
type SuperBlockOld struct {
InodesCountRaw uint32
BlocksCountLo uint32
diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go
index 463b5ba21..b734b6987 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go
@@ -21,7 +21,10 @@ import (
// TestSuperBlockSize tests that the superblock structs are of the correct
// size.
func TestSuperBlockSize(t *testing.T) {
- assertSize(t, SuperBlockOld{}, 84)
- assertSize(t, SuperBlock32Bit{}, 336)
- assertSize(t, SuperBlock64Bit{}, 1024)
+ var sbOld SuperBlockOld
+ assertSize(t, &sbOld, 84)
+ var sb32 SuperBlock32Bit
+ assertSize(t, &sb32, 336)
+ var sb64 SuperBlock64Bit
+ assertSize(t, &sb64, 1024)
}
diff --git a/pkg/sentry/fsimpl/ext/disklayout/test_utils.go b/pkg/sentry/fsimpl/ext/disklayout/test_utils.go
index 9c63f04c0..a4bc08411 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/test_utils.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/test_utils.go
@@ -18,13 +18,13 @@ import (
"reflect"
"testing"
- "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/marshal"
)
-func assertSize(t *testing.T, v interface{}, want uintptr) {
+func assertSize(t *testing.T, v marshal.Marshallable, want int) {
t.Helper()
- if got := binary.Size(v); got != want {
+ if got := v.SizeBytes(); got != want {
t.Errorf("struct %s should be exactly %d bytes but is %d bytes", reflect.TypeOf(v).Name(), want, got)
}
}
diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go
index 08ffc2834..aca258d40 100644
--- a/pkg/sentry/fsimpl/ext/ext.go
+++ b/pkg/sentry/fsimpl/ext/ext.go
@@ -34,6 +34,8 @@ import (
const Name = "ext"
// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type FilesystemType struct{}
// Compiles only if FilesystemType implements vfs.FilesystemType.
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 2dbaee287..0989558cd 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -71,7 +71,11 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys
vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.MountOptions{
+ GetFilesystemOptions: vfs.GetFilesystemOptions{
+ InternalData: int(f.Fd()),
+ },
+ })
if err != nil {
f.Close()
return nil, nil, nil, nil, err
diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go
index c36225a7c..778460107 100644
--- a/pkg/sentry/fsimpl/ext/extent_file.go
+++ b/pkg/sentry/fsimpl/ext/extent_file.go
@@ -18,12 +18,13 @@ import (
"io"
"sort"
- "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
"gvisor.dev/gvisor/pkg/syserror"
)
// extentFile is a type of regular file which uses extents to store file data.
+//
+// +stateify savable
type extentFile struct {
regFile regularFile
@@ -58,7 +59,7 @@ func newExtentFile(args inodeArgs) (*extentFile, error) {
func (f *extentFile) buildExtTree() error {
rootNodeData := f.regFile.inode.diskInode.Data()
- binary.Unmarshal(rootNodeData[:disklayout.ExtentHeaderSize], binary.LittleEndian, &f.root.Header)
+ f.root.Header.UnmarshalBytes(rootNodeData[:disklayout.ExtentHeaderSize])
// Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries.
if f.root.Header.NumEntries > 4 {
@@ -77,7 +78,7 @@ func (f *extentFile) buildExtTree() error {
// Internal node.
curEntry = &disklayout.ExtentIdx{}
}
- binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentEntrySize], binary.LittleEndian, curEntry)
+ curEntry.UnmarshalBytes(rootNodeData[off : off+disklayout.ExtentEntrySize])
f.root.Entries[i].Entry = curEntry
}
diff --git a/pkg/sentry/fsimpl/ext/extent_test.go b/pkg/sentry/fsimpl/ext/extent_test.go
index cd10d46ee..985f76ac0 100644
--- a/pkg/sentry/fsimpl/ext/extent_test.go
+++ b/pkg/sentry/fsimpl/ext/extent_test.go
@@ -21,7 +21,6 @@ import (
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
- "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
)
@@ -202,13 +201,14 @@ func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (*extentFile, []
// writeTree writes the tree represented by `root` to the inode and disk. It
// also writes random file data on disk.
func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, mockExtentBlkSize uint64) []byte {
- rootData := binary.Marshal(nil, binary.LittleEndian, root.Header)
+ rootData := in.diskInode.Data()
+ root.Header.MarshalBytes(rootData)
+ off := root.Header.SizeBytes()
for _, ep := range root.Entries {
- rootData = binary.Marshal(rootData, binary.LittleEndian, ep.Entry)
+ ep.Entry.MarshalBytes(rootData[off:])
+ off += ep.Entry.SizeBytes()
}
- copy(in.diskInode.Data(), rootData)
-
var fileData []byte
for _, ep := range root.Entries {
if root.Header.Height == 0 {
@@ -223,13 +223,14 @@ func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, mockExtentBl
// writeTreeToDisk is the recursive step for writeTree which writes the tree
// on the disk only. Also writes random file data on disk.
func writeTreeToDisk(disk []byte, curNode disklayout.ExtentEntryPair) []byte {
- nodeData := binary.Marshal(nil, binary.LittleEndian, curNode.Node.Header)
+ nodeData := disk[curNode.Entry.PhysicalBlock()*mockExtentBlkSize:]
+ curNode.Node.Header.MarshalBytes(nodeData)
+ off := curNode.Node.Header.SizeBytes()
for _, ep := range curNode.Node.Entries {
- nodeData = binary.Marshal(nodeData, binary.LittleEndian, ep.Entry)
+ ep.Entry.MarshalBytes(nodeData[off:])
+ off += ep.Entry.SizeBytes()
}
- copy(disk[curNode.Entry.PhysicalBlock()*mockExtentBlkSize:], nodeData)
-
var fileData []byte
for _, ep := range curNode.Node.Entries {
if curNode.Node.Header.Height == 0 {
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index 8565d1a66..917f1873d 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -38,11 +38,13 @@ var (
)
// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
type filesystem struct {
vfsfs vfs.Filesystem
// mu serializes changes to the Dentry tree.
- mu sync.RWMutex
+ mu sync.RWMutex `state:"nosave"`
// dev represents the underlying fs device. It does not require protection
// because io.ReaderAt permits concurrent read calls to it. It translates to
@@ -490,7 +492,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
return syserror.EROFS
}
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
_, inode, err := fs.walk(ctx, rp, false)
if err != nil {
@@ -504,8 +506,8 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
return nil, syserror.ECONNREFUSED
}
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
_, _, err := fs.walk(ctx, rp, false)
if err != nil {
return nil, err
@@ -513,8 +515,8 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
return nil, syserror.ENOTSUP
}
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
_, _, err := fs.walk(ctx, rp, false)
if err != nil {
return "", err
@@ -522,8 +524,8 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
return "", syserror.ENOTSUP
}
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
_, _, err := fs.walk(ctx, rp, false)
if err != nil {
return err
@@ -531,8 +533,8 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
return syserror.ENOTSUP
}
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
_, _, err := fs.walk(ctx, rp, false)
if err != nil {
return err
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index 30636cf66..9009ba3c7 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -37,6 +37,8 @@ import (
// |-- regular--
// |-- extent file
// |-- block map file
+//
+// +stateify savable
type inode struct {
// refs is a reference count. refs is accessed using atomic memory operations.
refs int64
diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go
index e73e740d6..4a5539b37 100644
--- a/pkg/sentry/fsimpl/ext/regular_file.go
+++ b/pkg/sentry/fsimpl/ext/regular_file.go
@@ -31,6 +31,8 @@ import (
// regularFile represents a regular file's inode. This too follows the
// inheritance pattern prevelant in the vfs layer described in
// pkg/sentry/vfs/README.md.
+//
+// +stateify savable
type regularFile struct {
inode inode
@@ -67,6 +69,8 @@ func (in *inode) isRegular() bool {
// directoryFD represents a directory file description. It implements
// vfs.FileDescriptionImpl.
+//
+// +stateify savable
type regularFileFD struct {
fileDescription
vfs.LockFD
@@ -75,7 +79,7 @@ type regularFileFD struct {
off int64
// offMu serializes operations that may mutate off.
- offMu sync.Mutex
+ offMu sync.Mutex `state:"nosave"`
}
// Release implements vfs.FileDescriptionImpl.Release.
diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go
index 2fd0d1fa8..5e2bcc837 100644
--- a/pkg/sentry/fsimpl/ext/symlink.go
+++ b/pkg/sentry/fsimpl/ext/symlink.go
@@ -23,6 +23,8 @@ import (
)
// symlink represents a symlink inode.
+//
+// +stateify savable
type symlink struct {
inode inode
target string // immutable
@@ -61,9 +63,11 @@ func (in *inode) isSymlink() bool {
return ok
}
-// symlinkFD represents a symlink file description and implements implements
+// symlinkFD represents a symlink file description and implements
// vfs.FileDescriptionImpl. which may only be used if open options contains
// O_PATH. For this reason most of the functions return EBADF.
+//
+// +stateify savable
type symlinkFD struct {
fileDescription
vfs.NoLockFD
diff --git a/pkg/sentry/fsimpl/ext/utils.go b/pkg/sentry/fsimpl/ext/utils.go
index d8b728f8c..58ef7b9b8 100644
--- a/pkg/sentry/fsimpl/ext/utils.go
+++ b/pkg/sentry/fsimpl/ext/utils.go
@@ -17,21 +17,21 @@ package ext
import (
"io"
- "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
"gvisor.dev/gvisor/pkg/syserror"
)
// readFromDisk performs a binary read from disk into the given struct from
// the absolute offset provided.
-func readFromDisk(dev io.ReaderAt, abOff int64, v interface{}) error {
- n := binary.Size(v)
+func readFromDisk(dev io.ReaderAt, abOff int64, v marshal.Marshallable) error {
+ n := v.SizeBytes()
buf := make([]byte, n)
if read, _ := dev.ReadAt(buf, abOff); read < int(n) {
return syserror.EIO
}
- binary.Unmarshal(buf, binary.LittleEndian, v)
+ v.UnmarshalBytes(buf)
return nil
}
diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD
index 53a4f3012..045d7ab08 100644
--- a/pkg/sentry/fsimpl/fuse/BUILD
+++ b/pkg/sentry/fsimpl/fuse/BUILD
@@ -30,19 +30,26 @@ go_library(
name = "fuse",
srcs = [
"connection.go",
+ "connection_control.go",
"dev.go",
+ "directory.go",
+ "file.go",
"fusefs.go",
- "init.go",
"inode_refs.go",
+ "read_write.go",
"register.go",
+ "regular_file.go",
"request_list.go",
+ "request_response.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
"//pkg/context",
"//pkg/log",
+ "//pkg/marshal",
"//pkg/refs",
+ "//pkg/safemem",
"//pkg/sentry/fsimpl/devtmpfs",
"//pkg/sentry/fsimpl/kernfs",
"//pkg/sentry/kernel",
@@ -52,7 +59,6 @@ go_library(
"//pkg/syserror",
"//pkg/usermem",
"//pkg/waiter",
- "//tools/go_marshal/marshal",
"@org_golang_x_sys//unix:go_default_library",
],
)
@@ -60,10 +66,15 @@ go_library(
go_test(
name = "fuse_test",
size = "small",
- srcs = ["dev_test.go"],
+ srcs = [
+ "connection_test.go",
+ "dev_test.go",
+ "utils_test.go",
+ ],
library = ":fuse",
deps = [
"//pkg/abi/linux",
+ "//pkg/marshal",
"//pkg/sentry/fsimpl/testutil",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
@@ -71,6 +82,5 @@ go_test(
"//pkg/syserror",
"//pkg/usermem",
"//pkg/waiter",
- "//tools/go_marshal/marshal",
],
)
diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go
index 6df2728ab..8ccda1264 100644
--- a/pkg/sentry/fsimpl/fuse/connection.go
+++ b/pkg/sentry/fsimpl/fuse/connection.go
@@ -15,31 +15,17 @@
package fuse
import (
- "errors"
- "fmt"
"sync"
- "sync/atomic"
- "syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
- "gvisor.dev/gvisor/tools/go_marshal/marshal"
)
-// maxActiveRequestsDefault is the default setting controlling the upper bound
-// on the number of active requests at any given time.
-const maxActiveRequestsDefault = 10000
-
-// Ordinary requests have even IDs, while interrupts IDs are odd.
-// Used to increment the unique ID for each FUSE request.
-var reqIDStep uint64 = 2
-
const (
// fuseDefaultMaxBackground is the default value for MaxBackground.
fuseDefaultMaxBackground = 12
@@ -52,43 +38,39 @@ const (
fuseDefaultMaxPagesPerReq = 32
)
-// Request represents a FUSE operation request that hasn't been sent to the
-// server yet.
+// connection is the struct by which the sentry communicates with the FUSE server daemon.
//
-// +stateify savable
-type Request struct {
- requestEntry
-
- id linux.FUSEOpID
- hdr *linux.FUSEHeaderIn
- data []byte
-}
-
-// Response represents an actual response from the server, including the
-// response payload.
+// Lock order:
+// - conn.fd.mu
+// - conn.mu
+// - conn.asyncMu
//
// +stateify savable
-type Response struct {
- opcode linux.FUSEOpcode
- hdr linux.FUSEHeaderOut
- data []byte
-}
-
-// connection is the struct by which the sentry communicates with the FUSE server daemon.
type connection struct {
fd *DeviceFD
+ // mu protects access to struct memebers.
+ mu sync.Mutex `state:"nosave"`
+
+ // attributeVersion is the version of connection's attributes.
+ attributeVersion uint64
+
+ // We target FUSE 7.23.
// The following FUSE_INIT flags are currently unsupported by this implementation:
- // - FUSE_ATOMIC_O_TRUNC: requires open(..., O_TRUNC)
// - FUSE_EXPORT_SUPPORT
- // - FUSE_HANDLE_KILLPRIV
// - FUSE_POSIX_LOCKS: requires POSIX locks
// - FUSE_FLOCK_LOCKS: requires POSIX locks
// - FUSE_AUTO_INVAL_DATA: requires page caching eviction
- // - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction
// - FUSE_DO_READDIRPLUS/FUSE_READDIRPLUS_AUTO: requires FUSE_READDIRPLUS implementation
// - FUSE_ASYNC_DIO
- // - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler
+ // - FUSE_PARALLEL_DIROPS (7.25)
+ // - FUSE_HANDLE_KILLPRIV (7.26)
+ // - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler (7.26)
+ // - FUSE_ABORT_ERROR (7.27)
+ // - FUSE_CACHE_SYMLINKS (7.28)
+ // - FUSE_NO_OPENDIR_SUPPORT (7.29)
+ // - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction (7.30)
+ // - FUSE_MAP_ALIGNMENT (7.31)
// initialized after receiving FUSE_INIT reply.
// Until it's set, suspend sending FUSE requests.
@@ -96,11 +78,7 @@ type connection struct {
initialized int32
// initializedChan is used to block requests before initialization.
- initializedChan chan struct{}
-
- // blocked when there are too many outstading backgrounds requests (NumBackground == MaxBackground).
- // TODO(gvisor.dev/issue/3185): update the numBackground accordingly; use a channel to block.
- blocked bool
+ initializedChan chan struct{} `state:".(bool)"`
// connected (connection established) when a new FUSE file system is created.
// Set to false when:
@@ -109,48 +87,55 @@ type connection struct {
// device release.
connected bool
- // aborted via sysfs.
- // TODO(gvisor.dev/issue/3185): abort all queued requests.
- aborted bool
-
// connInitError if FUSE_INIT encountered error (major version mismatch).
// Only set in INIT.
connInitError bool
// connInitSuccess if FUSE_INIT is successful.
// Only set in INIT.
- // Used for destory.
+ // Used for destory (not yet implemented).
connInitSuccess bool
- // TODO(gvisor.dev/issue/3185): All the queue logic are working in progress.
-
- // NumberBackground is the number of requests in the background.
- numBackground uint16
+ // aborted via sysfs, and will send ECONNABORTED to read after disconnection (instead of ENODEV).
+ // Set only if abortErr is true and via fuse control fs (not yet implemented).
+ // TODO(gvisor.dev/issue/3525): set this to true when user aborts.
+ aborted bool
- // congestionThreshold for NumBackground.
- // Negotiated in FUSE_INIT.
- congestionThreshold uint16
+ // numWating is the number of requests waiting to be
+ // sent to FUSE device or being processed by FUSE daemon.
+ numWaiting uint32
- // maxBackground is the maximum number of NumBackground.
- // Block connection when it is reached.
- // Negotiated in FUSE_INIT.
- maxBackground uint16
+ // Terminology note:
+ //
+ // - `asyncNumMax` is the `MaxBackground` in the FUSE_INIT_IN struct.
+ //
+ // - `asyncCongestionThreshold` is the `CongestionThreshold` in the FUSE_INIT_IN struct.
+ //
+ // We call the "background" requests in unix term as async requests.
+ // The "async requests" in unix term is our async requests that expect a reply,
+ // i.e. `!request.noReply`
- // numActiveBackground is the number of requests in background and has being marked as active.
- numActiveBackground uint16
+ // asyncMu protects the async request fields.
+ asyncMu sync.Mutex `state:"nosave"`
- // numWating is the number of requests waiting for completion.
- numWaiting uint32
+ // asyncNum is the number of async requests.
+ // Protected by asyncMu.
+ asyncNum uint16
- // TODO(gvisor.dev/issue/3185): BgQueue
- // some queue for background queued requests.
+ // asyncCongestionThreshold the number of async requests.
+ // Negotiated in FUSE_INIT as "CongestionThreshold".
+ // TODO(gvisor.dev/issue/3529): add congestion control.
+ // Protected by asyncMu.
+ asyncCongestionThreshold uint16
- // bgLock protects:
- // MaxBackground, CongestionThreshold, NumBackground,
- // NumActiveBackground, BgQueue, Blocked.
- bgLock sync.Mutex
+ // asyncNumMax is the maximum number of asyncNum.
+ // Connection blocks the async requests when it is reached.
+ // Negotiated in FUSE_INIT as "MaxBackground".
+ // Protected by asyncMu.
+ asyncNumMax uint16
// maxRead is the maximum size of a read buffer in in bytes.
+ // Initialized from a fuse fs parameter.
maxRead uint32
// maxWrite is the maximum size of a write buffer in bytes.
@@ -165,23 +150,20 @@ type connection struct {
// Negotiated and only set in INIT.
minor uint32
- // asyncRead if read pages asynchronously.
+ // atomicOTrunc is true when FUSE does not send a separate SETATTR request
+ // before open with O_TRUNC flag.
// Negotiated and only set in INIT.
- asyncRead bool
+ atomicOTrunc bool
- // abortErr is true if kernel need to return an unique read error after abort.
+ // asyncRead if read pages asynchronously.
// Negotiated and only set in INIT.
- abortErr bool
+ asyncRead bool
// writebackCache is true for write-back cache policy,
// false for write-through policy.
// Negotiated and only set in INIT.
writebackCache bool
- // cacheSymlinks if filesystem needs to cache READLINK responses in page cache.
- // Negotiated and only set in INIT.
- cacheSymlinks bool
-
// bigWrites if doing multi-page cached writes.
// Negotiated and only set in INIT.
bigWrites bool
@@ -189,116 +171,86 @@ type connection struct {
// dontMask if filestestem does not apply umask to creation modes.
// Negotiated in INIT.
dontMask bool
+
+ // noOpen if FUSE server doesn't support open operation.
+ // This flag only influence performance, not correctness of the program.
+ noOpen bool
+}
+
+func (conn *connection) saveInitializedChan() bool {
+ select {
+ case <-conn.initializedChan:
+ return true // Closed.
+ default:
+ return false // Not closed.
+ }
+}
+
+func (conn *connection) loadInitializedChan(closed bool) {
+ conn.initializedChan = make(chan struct{}, 1)
+ if closed {
+ close(conn.initializedChan)
+ }
}
// newFUSEConnection creates a FUSE connection to fd.
-func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRequests uint64) (*connection, error) {
+func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, opts *filesystemOptions) (*connection, error) {
// Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to
// mount a FUSE filesystem.
fuseFD := fd.Impl().(*DeviceFD)
- fuseFD.mounted = true
// Create the writeBuf for the header to be stored in.
hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
fuseFD.writeBuf = make([]byte, hdrLen)
fuseFD.completions = make(map[linux.FUSEOpID]*futureResponse)
- fuseFD.fullQueueCh = make(chan struct{}, maxInFlightRequests)
+ fuseFD.fullQueueCh = make(chan struct{}, opts.maxActiveRequests)
fuseFD.writeCursor = 0
return &connection{
- fd: fuseFD,
- maxBackground: fuseDefaultMaxBackground,
- congestionThreshold: fuseDefaultCongestionThreshold,
- maxPages: fuseDefaultMaxPagesPerReq,
- initializedChan: make(chan struct{}),
- connected: true,
- }, nil
-}
-
-// SetInitialized atomically sets the connection as initialized.
-func (conn *connection) SetInitialized() {
- // Unblock the requests sent before INIT.
- close(conn.initializedChan)
-
- // Close the channel first to avoid the non-atomic situation
- // where conn.initialized is true but there are
- // tasks being blocked on the channel.
- // And it prevents the newer tasks from gaining
- // unnecessary higher chance to be issued before the blocked one.
-
- atomic.StoreInt32(&(conn.initialized), int32(1))
-}
-
-// IsInitialized atomically check if the connection is initialized.
-// pairs with SetInitialized().
-func (conn *connection) Initialized() bool {
- return atomic.LoadInt32(&(conn.initialized)) != 0
-}
-
-// NewRequest creates a new request that can be sent to the FUSE server.
-func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) {
- conn.fd.mu.Lock()
- defer conn.fd.mu.Unlock()
- conn.fd.nextOpID += linux.FUSEOpID(reqIDStep)
-
- hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes()
- hdr := linux.FUSEHeaderIn{
- Len: uint32(hdrLen + payload.SizeBytes()),
- Opcode: opcode,
- Unique: conn.fd.nextOpID,
- NodeID: ino,
- UID: uint32(creds.EffectiveKUID),
- GID: uint32(creds.EffectiveKGID),
- PID: pid,
- }
-
- buf := make([]byte, hdr.Len)
- hdr.MarshalUnsafe(buf[:hdrLen])
- payload.MarshalUnsafe(buf[hdrLen:])
-
- return &Request{
- id: hdr.Unique,
- hdr: &hdr,
- data: buf,
+ fd: fuseFD,
+ asyncNumMax: fuseDefaultMaxBackground,
+ asyncCongestionThreshold: fuseDefaultCongestionThreshold,
+ maxRead: opts.maxRead,
+ maxPages: fuseDefaultMaxPagesPerReq,
+ initializedChan: make(chan struct{}),
+ connected: true,
}, nil
}
-// Call makes a request to the server and blocks the invoking task until a
-// server responds with a response. Task should never be nil.
-// Requests will not be sent before the connection is initialized.
-// For async tasks, use CallAsync().
-func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) {
- // Block requests sent before connection is initalized.
- if !conn.Initialized() {
- if err := t.Block(conn.initializedChan); err != nil {
- return nil, err
- }
- }
-
- return conn.call(t, r)
+// CallAsync makes an async (aka background) request.
+// It's a simple wrapper around Call().
+func (conn *connection) CallAsync(t *kernel.Task, r *Request) error {
+ r.async = true
+ _, err := conn.Call(t, r)
+ return err
}
-// CallAsync makes an async (aka background) request.
-// Those requests either do not expect a response (e.g. release) or
-// the response should be handled by others (e.g. init).
-// Return immediately unless the connection is blocked (before initialization).
-// Async call example: init, release, forget, aio, interrupt.
+// Call makes a request to the server.
+// Block before the connection is initialized.
// When the Request is FUSE_INIT, it will not be blocked before initialization.
-func (conn *connection) CallAsync(t *kernel.Task, r *Request) error {
+// Task should never be nil.
+//
+// For a sync request, it blocks the invoking task until
+// a server responds with a response.
+//
+// For an async request (that do not expect a response immediately),
+// it returns directly unless being blocked either before initialization
+// or when there are too many async requests ongoing.
+//
+// Example for async request:
+// init, readahead, write, async read/write, fuse_notify_reply,
+// non-sync release, interrupt, forget.
+//
+// The forget request does not have a reply,
+// as documented in include/uapi/linux/fuse.h:FUSE_FORGET.
+func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) {
// Block requests sent before connection is initalized.
if !conn.Initialized() && r.hdr.Opcode != linux.FUSE_INIT {
if err := t.Block(conn.initializedChan); err != nil {
- return err
+ return nil, err
}
}
- // This should be the only place that invokes call() with a nil task.
- _, err := conn.call(nil, r)
- return err
-}
-
-// call makes a call without blocking checks.
-func (conn *connection) call(t *kernel.Task, r *Request) (*Response, error) {
if !conn.connected {
return nil, syserror.ENOTCONN
}
@@ -315,31 +267,6 @@ func (conn *connection) call(t *kernel.Task, r *Request) (*Response, error) {
return fut.resolve(t)
}
-// Error returns the error of the FUSE call.
-func (r *Response) Error() error {
- errno := r.hdr.Error
- if errno >= 0 {
- return nil
- }
-
- sysErrNo := syscall.Errno(-errno)
- return error(sysErrNo)
-}
-
-// UnmarshalPayload unmarshals the response data into m.
-func (r *Response) UnmarshalPayload(m marshal.Marshallable) error {
- hdrLen := r.hdr.SizeBytes()
- haveDataLen := r.hdr.Len - uint32(hdrLen)
- wantDataLen := uint32(m.SizeBytes())
-
- if haveDataLen < wantDataLen {
- return fmt.Errorf("payload too small. Minimum data lenth required: %d, but got data length %d", wantDataLen, haveDataLen)
- }
-
- m.UnmarshalUnsafe(r.data[hdrLen:])
- return nil
-}
-
// callFuture makes a request to the server and returns a future response.
// Call resolve() when the response needs to be fulfilled.
func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, error) {
@@ -358,11 +285,6 @@ func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse,
// if there are always too many ongoing requests all the time. The
// supported maxActiveRequests setting should be really high to avoid this.
for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests {
- if t == nil {
- // Since there is no task that is waiting. We must error out.
- return nil, errors.New("FUSE request queue full")
- }
-
log.Infof("Blocking request %v from being queued. Too many active requests: %v",
r.id, conn.fd.numActiveRequests)
conn.fd.mu.Unlock()
@@ -378,9 +300,19 @@ func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse,
// callFutureLocked makes a request to the server and returns a future response.
func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureResponse, error) {
+ // Check connected again holding conn.mu.
+ conn.mu.Lock()
+ if !conn.connected {
+ conn.mu.Unlock()
+ // we checked connected before,
+ // this must be due to aborted connection.
+ return nil, syserror.ECONNABORTED
+ }
+ conn.mu.Unlock()
+
conn.fd.queue.PushBack(r)
- conn.fd.numActiveRequests += 1
- fut := newFutureResponse(r.hdr.Opcode)
+ conn.fd.numActiveRequests++
+ fut := newFutureResponse(r)
conn.fd.completions[r.id] = fut
// Signal the readers that there is something to read.
@@ -388,50 +320,3 @@ func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureRes
return fut, nil
}
-
-// futureResponse represents an in-flight request, that may or may not have
-// completed yet. Convert it to a resolved Response by calling Resolve, but note
-// that this may block.
-//
-// +stateify savable
-type futureResponse struct {
- opcode linux.FUSEOpcode
- ch chan struct{}
- hdr *linux.FUSEHeaderOut
- data []byte
-}
-
-// newFutureResponse creates a future response to a FUSE request.
-func newFutureResponse(opcode linux.FUSEOpcode) *futureResponse {
- return &futureResponse{
- opcode: opcode,
- ch: make(chan struct{}),
- }
-}
-
-// resolve blocks the task until the server responds to its corresponding request,
-// then returns a resolved response.
-func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) {
- // If there is no Task associated with this request - then we don't try to resolve
- // the response. Instead, the task writing the response (proxy to the server) will
- // process the response on our behalf.
- if t == nil {
- log.Infof("fuse.Response.resolve: Not waiting on a response from server.")
- return nil, nil
- }
-
- if err := t.Block(f.ch); err != nil {
- return nil, err
- }
-
- return f.getResponse(), nil
-}
-
-// getResponse creates a Response from the data the futureResponse has.
-func (f *futureResponse) getResponse() *Response {
- return &Response{
- opcode: f.opcode,
- hdr: *f.hdr,
- data: f.data,
- }
-}
diff --git a/pkg/sentry/fsimpl/fuse/init.go b/pkg/sentry/fsimpl/fuse/connection_control.go
index 779c2bd3f..bfde78559 100644
--- a/pkg/sentry/fsimpl/fuse/init.go
+++ b/pkg/sentry/fsimpl/fuse/connection_control.go
@@ -15,7 +15,11 @@
package fuse
import (
+ "sync/atomic"
+ "syscall"
+
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
)
@@ -29,9 +33,10 @@ const (
// Follow the same behavior as unix fuse implementation.
fuseMaxTimeGranNs = 1000000000
- // Minimum value for MaxWrite.
+ // Minimum value for MaxWrite and MaxRead.
// Follow the same behavior as unix fuse implementation.
fuseMinMaxWrite = 4096
+ fuseMinMaxRead = 4096
// Temporary default value for max readahead, 128kb.
fuseDefaultMaxReadahead = 131072
@@ -49,6 +54,26 @@ var (
MaxUserCongestionThreshold uint16 = fuseDefaultCongestionThreshold
)
+// SetInitialized atomically sets the connection as initialized.
+func (conn *connection) SetInitialized() {
+ // Unblock the requests sent before INIT.
+ close(conn.initializedChan)
+
+ // Close the channel first to avoid the non-atomic situation
+ // where conn.initialized is true but there are
+ // tasks being blocked on the channel.
+ // And it prevents the newer tasks from gaining
+ // unnecessary higher chance to be issued before the blocked one.
+
+ atomic.StoreInt32(&(conn.initialized), int32(1))
+}
+
+// IsInitialized atomically check if the connection is initialized.
+// pairs with SetInitialized().
+func (conn *connection) Initialized() bool {
+ return atomic.LoadInt32(&(conn.initialized)) != 0
+}
+
// InitSend sends a FUSE_INIT request.
func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error {
in := linux.FUSEInitIn{
@@ -70,29 +95,31 @@ func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error {
}
// InitRecv receives a FUSE_INIT reply and process it.
+//
+// Preconditions: conn.asyncMu must not be held if minor verion is newer than 13.
func (conn *connection) InitRecv(res *Response, hasSysAdminCap bool) error {
if err := res.Error(); err != nil {
return err
}
- var out linux.FUSEInitOut
- if err := res.UnmarshalPayload(&out); err != nil {
+ initRes := fuseInitRes{initLen: res.DataLen()}
+ if err := res.UnmarshalPayload(&initRes); err != nil {
return err
}
- return conn.initProcessReply(&out, hasSysAdminCap)
+ return conn.initProcessReply(&initRes.initOut, hasSysAdminCap)
}
// Process the FUSE_INIT reply from the FUSE server.
+// It tries to acquire the conn.asyncMu lock if minor version is newer than 13.
func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap bool) error {
+ // No matter error or not, always set initialzied.
+ // to unblock the blocked requests.
+ defer conn.SetInitialized()
+
// No support for old major fuse versions.
if out.Major != linux.FUSE_KERNEL_VERSION {
conn.connInitError = true
-
- // Set the connection as initialized and unblock the blocked requests
- // (i.e. return error for them).
- conn.SetInitialized()
-
return nil
}
@@ -100,29 +127,14 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap
conn.connInitSuccess = true
conn.minor = out.Minor
- // No support for limits before minor version 13.
- if out.Minor >= 13 {
- conn.bgLock.Lock()
-
- if out.MaxBackground > 0 {
- conn.maxBackground = out.MaxBackground
-
- if !hasSysAdminCap &&
- conn.maxBackground > MaxUserBackgroundRequest {
- conn.maxBackground = MaxUserBackgroundRequest
- }
- }
-
- if out.CongestionThreshold > 0 {
- conn.congestionThreshold = out.CongestionThreshold
-
- if !hasSysAdminCap &&
- conn.congestionThreshold > MaxUserCongestionThreshold {
- conn.congestionThreshold = MaxUserCongestionThreshold
- }
- }
-
- conn.bgLock.Unlock()
+ // No support for negotiating MaxWrite before minor version 5.
+ if out.Minor >= 5 {
+ conn.maxWrite = out.MaxWrite
+ } else {
+ conn.maxWrite = fuseMinMaxWrite
+ }
+ if conn.maxWrite < fuseMinMaxWrite {
+ conn.maxWrite = fuseMinMaxWrite
}
// No support for the following flags before minor version 6.
@@ -131,8 +143,6 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap
conn.bigWrites = out.Flags&linux.FUSE_BIG_WRITES != 0
conn.dontMask = out.Flags&linux.FUSE_DONT_MASK != 0
conn.writebackCache = out.Flags&linux.FUSE_WRITEBACK_CACHE != 0
- conn.cacheSymlinks = out.Flags&linux.FUSE_CACHE_SYMLINKS != 0
- conn.abortErr = out.Flags&linux.FUSE_ABORT_ERROR != 0
// TODO(gvisor.dev/issue/3195): figure out how to use TimeGran (0 < TimeGran <= fuseMaxTimeGranNs).
@@ -148,19 +158,90 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap
}
}
- // No support for negotiating MaxWrite before minor version 5.
- if out.Minor >= 5 {
- conn.maxWrite = out.MaxWrite
- } else {
- conn.maxWrite = fuseMinMaxWrite
+ // No support for limits before minor version 13.
+ if out.Minor >= 13 {
+ conn.asyncMu.Lock()
+
+ if out.MaxBackground > 0 {
+ conn.asyncNumMax = out.MaxBackground
+
+ if !hasSysAdminCap &&
+ conn.asyncNumMax > MaxUserBackgroundRequest {
+ conn.asyncNumMax = MaxUserBackgroundRequest
+ }
+ }
+
+ if out.CongestionThreshold > 0 {
+ conn.asyncCongestionThreshold = out.CongestionThreshold
+
+ if !hasSysAdminCap &&
+ conn.asyncCongestionThreshold > MaxUserCongestionThreshold {
+ conn.asyncCongestionThreshold = MaxUserCongestionThreshold
+ }
+ }
+
+ conn.asyncMu.Unlock()
}
- if conn.maxWrite < fuseMinMaxWrite {
- conn.maxWrite = fuseMinMaxWrite
+
+ return nil
+}
+
+// Abort this FUSE connection.
+// It tries to acquire conn.fd.mu, conn.lock, conn.bgLock in order.
+// All possible requests waiting or blocking will be aborted.
+//
+// Preconditions: conn.fd.mu is locked.
+func (conn *connection) Abort(ctx context.Context) {
+ conn.mu.Lock()
+ conn.asyncMu.Lock()
+
+ if !conn.connected {
+ conn.asyncMu.Unlock()
+ conn.mu.Unlock()
+ conn.fd.mu.Unlock()
+ return
}
- // Set connection as initialized and unblock the requests
- // issued before init.
- conn.SetInitialized()
+ conn.connected = false
- return nil
+ // Empty the `fd.queue` that holds the requests
+ // not yet read by the FUSE daemon yet.
+ // These are a subset of the requests in `fuse.completion` map.
+ for !conn.fd.queue.Empty() {
+ req := conn.fd.queue.Front()
+ conn.fd.queue.Remove(req)
+ }
+
+ var terminate []linux.FUSEOpID
+
+ // 2. Collect the requests have not been sent to FUSE daemon,
+ // or have not received a reply.
+ for unique := range conn.fd.completions {
+ terminate = append(terminate, unique)
+ }
+
+ // Release locks to avoid deadlock.
+ conn.asyncMu.Unlock()
+ conn.mu.Unlock()
+
+ // 1. The requets blocked before initialization.
+ // Will reach call() `connected` check and return.
+ if !conn.Initialized() {
+ conn.SetInitialized()
+ }
+
+ // 2. Terminate the requests collected above.
+ // Set ECONNABORTED error.
+ // sendError() will remove them from `fd.completion` map.
+ // Will enter the path of a normally received error.
+ for _, toTerminate := range terminate {
+ conn.fd.sendError(ctx, -int32(syscall.ECONNABORTED), toTerminate)
+ }
+
+ // 3. The requests not yet written to FUSE device.
+ // Early terminate.
+ // Will reach callFutureLocked() `connected` check and return.
+ close(conn.fd.fullQueueCh)
+
+ // TODO(gvisor.dev/issue/3528): Forget all pending forget reqs.
}
diff --git a/pkg/sentry/fsimpl/fuse/connection_test.go b/pkg/sentry/fsimpl/fuse/connection_test.go
new file mode 100644
index 000000000..91d16c1cf
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/connection_test.go
@@ -0,0 +1,117 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "math/rand"
+ "syscall"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// TestConnectionInitBlock tests if initialization
+// correctly blocks and unblocks the connection.
+// Since it's unfeasible to test kernelTask.Block() in unit test,
+// the code in Call() are not tested here.
+func TestConnectionInitBlock(t *testing.T) {
+ s := setup(t)
+ defer s.Destroy()
+
+ k := kernel.KernelFromContext(s.Ctx)
+
+ conn, _, err := newTestConnection(s, k, maxActiveRequestsDefault)
+ if err != nil {
+ t.Fatalf("newTestConnection: %v", err)
+ }
+
+ select {
+ case <-conn.initializedChan:
+ t.Fatalf("initializedChan should be blocking before SetInitialized")
+ default:
+ }
+
+ conn.SetInitialized()
+
+ select {
+ case <-conn.initializedChan:
+ default:
+ t.Fatalf("initializedChan should not be blocking after SetInitialized")
+ }
+}
+
+func TestConnectionAbort(t *testing.T) {
+ s := setup(t)
+ defer s.Destroy()
+
+ k := kernel.KernelFromContext(s.Ctx)
+ creds := auth.CredentialsFromContext(s.Ctx)
+ task := kernel.TaskFromContext(s.Ctx)
+
+ const numRequests uint64 = 256
+
+ conn, _, err := newTestConnection(s, k, numRequests)
+ if err != nil {
+ t.Fatalf("newTestConnection: %v", err)
+ }
+
+ testObj := &testPayload{
+ data: rand.Uint32(),
+ }
+
+ var futNormal []*futureResponse
+
+ for i := 0; i < int(numRequests); i++ {
+ req, err := conn.NewRequest(creds, uint32(i), uint64(i), 0, testObj)
+ if err != nil {
+ t.Fatalf("NewRequest creation failed: %v", err)
+ }
+ fut, err := conn.callFutureLocked(task, req)
+ if err != nil {
+ t.Fatalf("callFutureLocked failed: %v", err)
+ }
+ futNormal = append(futNormal, fut)
+ }
+
+ conn.Abort(s.Ctx)
+
+ // Abort should unblock the initialization channel.
+ // Note: no test requests are actually blocked on `conn.initializedChan`.
+ select {
+ case <-conn.initializedChan:
+ default:
+ t.Fatalf("initializedChan should not be blocking after SetInitialized")
+ }
+
+ // Abort will return ECONNABORTED error to unblocked requests.
+ for _, fut := range futNormal {
+ if fut.getResponse().hdr.Error != -int32(syscall.ECONNABORTED) {
+ t.Fatalf("Incorrect error code received for aborted connection: %v", fut.getResponse().hdr.Error)
+ }
+ }
+
+ // After abort, Call() should return directly with ENOTCONN.
+ req, err := conn.NewRequest(creds, 0, 0, 0, testObj)
+ if err != nil {
+ t.Fatalf("NewRequest creation failed: %v", err)
+ }
+ _, err = conn.Call(task, req)
+ if err != syserror.ENOTCONN {
+ t.Fatalf("Incorrect error code received for Call() after connection aborted")
+ }
+
+}
diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go
index e522ff9a0..1b86a4b4c 100644
--- a/pkg/sentry/fsimpl/fuse/dev.go
+++ b/pkg/sentry/fsimpl/fuse/dev.go
@@ -19,7 +19,6 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -32,6 +31,8 @@ import (
const fuseDevMinor = 229
// fuseDevice implements vfs.Device for /dev/fuse.
+//
+// +stateify savable
type fuseDevice struct{}
// Open implements vfs.Device.Open.
@@ -50,15 +51,14 @@ func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, op
}
// DeviceFD implements vfs.FileDescriptionImpl for /dev/fuse.
+//
+// +stateify savable
type DeviceFD struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
vfs.NoLockFD
- // mounted specifies whether a FUSE filesystem was mounted using the DeviceFD.
- mounted bool
-
// nextOpID is used to create new requests.
nextOpID linux.FUSEOpID
@@ -83,7 +83,7 @@ type DeviceFD struct {
writeCursorFR *futureResponse
// mu protects all the queues, maps, buffers and cursors and nextOpID.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
// waitQueue is used to notify interested parties when the device becomes
// readable or writable.
@@ -92,21 +92,36 @@ type DeviceFD struct {
// fullQueueCh is a channel used to synchronize the readers with the writers.
// Writers (inbound requests to the filesystem) block if there are too many
// unprocessed in-flight requests.
- fullQueueCh chan struct{}
+ fullQueueCh chan struct{} `state:".(int)"`
// fs is the FUSE filesystem that this FD is being used for.
fs *filesystem
}
+func (fd *DeviceFD) saveFullQueueCh() int {
+ return cap(fd.fullQueueCh)
+}
+
+func (fd *DeviceFD) loadFullQueueCh(capacity int) {
+ fd.fullQueueCh = make(chan struct{}, capacity)
+}
+
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *DeviceFD) Release(context.Context) {
- fd.fs.conn.connected = false
+func (fd *DeviceFD) Release(ctx context.Context) {
+ if fd.fs != nil {
+ fd.fs.conn.mu.Lock()
+ fd.fs.conn.connected = false
+ fd.fs.conn.mu.Unlock()
+
+ fd.fs.VFSFilesystem().DecRef(ctx)
+ fd.fs = nil
+ }
}
// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
- if !fd.mounted {
+ if fd.fs == nil {
return 0, syserror.EPERM
}
@@ -116,10 +131,16 @@ func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset in
// Read implements vfs.FileDescriptionImpl.Read.
func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
- if !fd.mounted {
+ if fd.fs == nil {
return 0, syserror.EPERM
}
+ // Return ENODEV if the filesystem is umounted.
+ if fd.fs.umounted {
+ // TODO(gvisor.dev/issue/3525): return ECONNABORTED if aborted via fuse control fs.
+ return 0, syserror.ENODEV
+ }
+
// We require that any Read done on this filesystem have a sane minimum
// read buffer. It must have the capacity for the fixed parts of any request
// header (Linux uses the request header and the FUSEWriteIn header for this
@@ -143,58 +164,82 @@ func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.R
}
// readLocked implements the reading of the fuse device while locked with DeviceFD.mu.
+//
+// Preconditions: dst is large enough for any reasonable request.
func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
- if fd.queue.Empty() {
- return 0, syserror.ErrWouldBlock
- }
+ var req *Request
- var readCursor uint32
- var bytesRead int64
- for {
- req := fd.queue.Front()
- if dst.NumBytes() < int64(req.hdr.Len) {
- // The request is too large. Cannot process it. All requests must be smaller than the
- // negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT
- // handshake.
- errno := -int32(syscall.EIO)
- if req.hdr.Opcode == linux.FUSE_SETXATTR {
- errno = -int32(syscall.E2BIG)
- }
+ // Find the first valid request.
+ // For the normal case this loop only execute once.
+ for !fd.queue.Empty() {
+ req = fd.queue.Front()
- // Return the error to the calling task.
- if err := fd.sendError(ctx, errno, req); err != nil {
- return 0, err
- }
+ if int64(req.hdr.Len)+int64(len(req.payload)) <= dst.NumBytes() {
+ break
+ }
- // We're done with this request.
- fd.queue.Remove(req)
+ // The request is too large. Cannot process it. All requests must be smaller than the
+ // negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT
+ // handshake.
+ errno := -int32(syscall.EIO)
+ if req.hdr.Opcode == linux.FUSE_SETXATTR {
+ errno = -int32(syscall.E2BIG)
+ }
- // Restart the read as this request was invalid.
- log.Warningf("fuse.DeviceFD.Read: request found was too large. Restarting read.")
- return fd.readLocked(ctx, dst, opts)
+ // Return the error to the calling task.
+ if err := fd.sendError(ctx, errno, req.hdr.Unique); err != nil {
+ return 0, err
}
- n, err := dst.CopyOut(ctx, req.data[readCursor:])
+ // We're done with this request.
+ fd.queue.Remove(req)
+ req = nil
+ }
+
+ if req == nil {
+ return 0, syserror.ErrWouldBlock
+ }
+
+ // We already checked the size: dst must be able to fit the whole request.
+ // Now we write the marshalled header, the payload,
+ // and the potential additional payload
+ // to the user memory IOSequence.
+
+ n, err := dst.CopyOut(ctx, req.data)
+ if err != nil {
+ return 0, err
+ }
+ if n != len(req.data) {
+ return 0, syserror.EIO
+ }
+
+ if req.hdr.Opcode == linux.FUSE_WRITE {
+ written, err := dst.DropFirst(n).CopyOut(ctx, req.payload)
if err != nil {
return 0, err
}
- readCursor += uint32(n)
- bytesRead += int64(n)
-
- if readCursor >= req.hdr.Len {
- // Fully done with this req, remove it from the queue.
- fd.queue.Remove(req)
- break
+ if written != len(req.payload) {
+ return 0, syserror.EIO
}
+ n += int(written)
}
- return bytesRead, nil
+ // Fully done with this req, remove it from the queue.
+ fd.queue.Remove(req)
+
+ // Remove noReply ones from map of requests expecting a reply.
+ if req.noReply {
+ fd.numActiveRequests -= 1
+ delete(fd.completions, req.hdr.Unique)
+ }
+
+ return int64(n), nil
}
// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
- if !fd.mounted {
+ if fd.fs == nil {
return 0, syserror.EPERM
}
@@ -211,10 +256,15 @@ func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.
// writeLocked implements writing to the fuse device while locked with DeviceFD.mu.
func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
- if !fd.mounted {
+ if fd.fs == nil {
return 0, syserror.EPERM
}
+ // Return ENODEV if the filesystem is umounted.
+ if fd.fs.umounted {
+ return 0, syserror.ENODEV
+ }
+
var cn, n int64
hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
@@ -276,7 +326,8 @@ func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opt
fut, ok := fd.completions[hdr.Unique]
if !ok {
- // Server sent us a response for a request we never sent?
+ // Server sent us a response for a request we never sent,
+ // or for which we already received a reply (e.g. aborted), an unlikely event.
return 0, syserror.EINVAL
}
@@ -307,8 +358,23 @@ func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opt
// Readiness implements vfs.FileDescriptionImpl.Readiness.
func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ return fd.readinessLocked(mask)
+}
+
+// readinessLocked implements checking the readiness of the fuse device while
+// locked with DeviceFD.mu.
+func (fd *DeviceFD) readinessLocked(mask waiter.EventMask) waiter.EventMask {
var ready waiter.EventMask
- ready |= waiter.EventOut // FD is always writable
+
+ if fd.fs.umounted {
+ ready |= waiter.EventErr
+ return ready & mask
+ }
+
+ // FD is always writable.
+ ready |= waiter.EventOut
if !fd.queue.Empty() {
// Have reqs available, FD is readable.
ready |= waiter.EventIn
@@ -330,7 +396,7 @@ func (fd *DeviceFD) EventUnregister(e *waiter.Entry) {
// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
- if !fd.mounted {
+ if fd.fs == nil {
return 0, syserror.EPERM
}
@@ -338,59 +404,59 @@ func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64
}
// sendResponse sends a response to the waiting task (if any).
+//
+// Preconditions: fd.mu must be held.
func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error {
- // See if the running task need to perform some action before returning.
- // Since we just finished writing the future, we can be sure that
- // getResponse generates a populated response.
- if err := fd.noReceiverAction(ctx, fut.getResponse()); err != nil {
- return err
- }
+ // Signal the task waiting on a response if any.
+ defer close(fut.ch)
// Signal that the queue is no longer full.
select {
case fd.fullQueueCh <- struct{}{}:
default:
}
- fd.numActiveRequests -= 1
+ fd.numActiveRequests--
+
+ if fut.async {
+ return fd.asyncCallBack(ctx, fut.getResponse())
+ }
- // Signal the task waiting on a response.
- close(fut.ch)
return nil
}
-// sendError sends an error response to the waiting task (if any).
-func (fd *DeviceFD) sendError(ctx context.Context, errno int32, req *Request) error {
+// sendError sends an error response to the waiting task (if any) by calling sendResponse().
+//
+// Preconditions: fd.mu must be held.
+func (fd *DeviceFD) sendError(ctx context.Context, errno int32, unique linux.FUSEOpID) error {
// Return the error to the calling task.
outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
respHdr := linux.FUSEHeaderOut{
Len: outHdrLen,
Error: errno,
- Unique: req.hdr.Unique,
+ Unique: unique,
}
fut, ok := fd.completions[respHdr.Unique]
if !ok {
- // Server sent us a response for a request we never sent?
+ // A response for a request we never sent,
+ // or for which we already received a reply (e.g. aborted).
return syserror.EINVAL
}
delete(fd.completions, respHdr.Unique)
fut.hdr = &respHdr
- if err := fd.sendResponse(ctx, fut); err != nil {
- return err
- }
-
- return nil
+ return fd.sendResponse(ctx, fut)
}
-// noReceiverAction has the calling kernel.Task do some action if its known that no
-// receiver is going to be waiting on the future channel. This is to be used by:
-// FUSE_INIT.
-func (fd *DeviceFD) noReceiverAction(ctx context.Context, r *Response) error {
- if r.opcode == linux.FUSE_INIT {
+// asyncCallBack executes pre-defined callback function for async requests.
+// Currently used by: FUSE_INIT.
+func (fd *DeviceFD) asyncCallBack(ctx context.Context, r *Response) error {
+ switch r.opcode {
+ case linux.FUSE_INIT:
creds := auth.CredentialsFromContext(ctx)
rootUserNs := kernel.KernelFromContext(ctx).RootUserNamespace()
return fd.fs.conn.InitRecv(r, creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, rootUserNs))
+ // TODO(gvisor.dev/issue/3247): support async read: correctly process the response.
}
return nil
diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go
index 1ffe7ccd2..5986133e9 100644
--- a/pkg/sentry/fsimpl/fuse/dev_test.go
+++ b/pkg/sentry/fsimpl/fuse/dev_test.go
@@ -16,7 +16,6 @@ package fuse
import (
"fmt"
- "io"
"math/rand"
"testing"
@@ -28,17 +27,12 @@ import (
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
- "gvisor.dev/gvisor/tools/go_marshal/marshal"
)
// echoTestOpcode is the Opcode used during testing. The server used in tests
// will simply echo the payload back with the appropriate headers.
const echoTestOpcode linux.FUSEOpcode = 1000
-type testPayload struct {
- data uint32
-}
-
// TestFUSECommunication tests that the communication layer between the Sentry and the
// FUSE server daemon works as expected.
func TestFUSECommunication(t *testing.T) {
@@ -327,102 +321,3 @@ func fuseServerRun(t *testing.T, s *testutil.System, k *kernel.Kernel, fd *vfs.F
}
}
}
-
-func setup(t *testing.T) *testutil.System {
- k, err := testutil.Boot()
- if err != nil {
- t.Fatalf("Error creating kernel: %v", err)
- }
-
- ctx := k.SupervisorContext()
- creds := auth.CredentialsFromContext(ctx)
-
- k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
- AllowUserList: true,
- AllowUserMount: true,
- })
-
- mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
- if err != nil {
- t.Fatalf("NewMountNamespace(): %v", err)
- }
-
- return testutil.NewSystem(ctx, t, k.VFS(), mntns)
-}
-
-// newTestConnection creates a fuse connection that the sentry can communicate with
-// and the FD for the server to communicate with.
-func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*connection, *vfs.FileDescription, error) {
- vfsObj := &vfs.VirtualFilesystem{}
- fuseDev := &DeviceFD{}
-
- if err := vfsObj.Init(system.Ctx); err != nil {
- return nil, nil, err
- }
-
- vd := vfsObj.NewAnonVirtualDentry("genCountFD")
- defer vd.DecRef(system.Ctx)
- if err := fuseDev.vfsfd.Init(fuseDev, linux.O_RDWR|linux.O_CREAT, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{}); err != nil {
- return nil, nil, err
- }
-
- fsopts := filesystemOptions{
- maxActiveRequests: maxActiveRequests,
- }
- fs, err := NewFUSEFilesystem(system.Ctx, 0, &fsopts, &fuseDev.vfsfd)
- if err != nil {
- return nil, nil, err
- }
-
- return fs.conn, &fuseDev.vfsfd, nil
-}
-
-// SizeBytes implements marshal.Marshallable.SizeBytes.
-func (t *testPayload) SizeBytes() int {
- return 4
-}
-
-// MarshalBytes implements marshal.Marshallable.MarshalBytes.
-func (t *testPayload) MarshalBytes(dst []byte) {
- usermem.ByteOrder.PutUint32(dst[:4], t.data)
-}
-
-// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.
-func (t *testPayload) UnmarshalBytes(src []byte) {
- *t = testPayload{data: usermem.ByteOrder.Uint32(src[:4])}
-}
-
-// Packed implements marshal.Marshallable.Packed.
-func (t *testPayload) Packed() bool {
- return true
-}
-
-// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.
-func (t *testPayload) MarshalUnsafe(dst []byte) {
- t.MarshalBytes(dst)
-}
-
-// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.
-func (t *testPayload) UnmarshalUnsafe(src []byte) {
- t.UnmarshalBytes(src)
-}
-
-// CopyOutN implements marshal.Marshallable.CopyOutN.
-func (t *testPayload) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {
- panic("not implemented")
-}
-
-// CopyOut implements marshal.Marshallable.CopyOut.
-func (t *testPayload) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {
- panic("not implemented")
-}
-
-// CopyIn implements marshal.Marshallable.CopyIn.
-func (t *testPayload) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {
- panic("not implemented")
-}
-
-// WriteTo implements io.WriterTo.WriteTo.
-func (t *testPayload) WriteTo(w io.Writer) (int64, error) {
- panic("not implemented")
-}
diff --git a/pkg/sentry/fsimpl/fuse/directory.go b/pkg/sentry/fsimpl/fuse/directory.go
new file mode 100644
index 000000000..8f220a04b
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/directory.go
@@ -0,0 +1,105 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+type directoryFD struct {
+ fileDescription
+}
+
+// Allocate implements directoryFD.Allocate.
+func (*directoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ return syserror.EISDIR
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (*directoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ return 0, syserror.EISDIR
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (*directoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ return 0, syserror.EISDIR
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (*directoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ return 0, syserror.EISDIR
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (*directoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ return 0, syserror.EISDIR
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (dir *directoryFD) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback) error {
+ fusefs := dir.inode().fs
+ task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
+
+ in := linux.FUSEReadIn{
+ Fh: dir.Fh,
+ Offset: uint64(atomic.LoadInt64(&dir.off)),
+ Size: linux.FUSE_PAGE_SIZE,
+ Flags: dir.statusFlags(),
+ }
+
+ // TODO(gVisor.dev/issue/3404): Support FUSE_READDIRPLUS.
+ req, err := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), dir.inode().nodeID, linux.FUSE_READDIR, &in)
+ if err != nil {
+ return err
+ }
+
+ res, err := fusefs.conn.Call(task, req)
+ if err != nil {
+ return err
+ }
+ if err := res.Error(); err != nil {
+ return err
+ }
+
+ var out linux.FUSEDirents
+ if err := res.UnmarshalPayload(&out); err != nil {
+ return err
+ }
+
+ for _, fuseDirent := range out.Dirents {
+ nextOff := int64(fuseDirent.Meta.Off)
+ dirent := vfs.Dirent{
+ Name: fuseDirent.Name,
+ Type: uint8(fuseDirent.Meta.Type),
+ Ino: fuseDirent.Meta.Ino,
+ NextOff: nextOff,
+ }
+
+ if err := callback.Handle(dirent); err != nil {
+ return err
+ }
+ atomic.StoreInt64(&dir.off, nextOff)
+ }
+
+ return nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/file.go b/pkg/sentry/fsimpl/fuse/file.go
new file mode 100644
index 000000000..83f2816b7
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/file.go
@@ -0,0 +1,133 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// fileDescription implements vfs.FileDescriptionImpl for fuse.
+type fileDescription struct {
+ vfsfd vfs.FileDescription
+ vfs.FileDescriptionDefaultImpl
+ vfs.DentryMetadataFileDescriptionImpl
+ vfs.NoLockFD
+
+ // the file handle used in userspace.
+ Fh uint64
+
+ // Nonseekable is indicate cannot perform seek on a file.
+ Nonseekable bool
+
+ // DirectIO suggest fuse to use direct io operation.
+ DirectIO bool
+
+ // OpenFlag is the flag returned by open.
+ OpenFlag uint32
+
+ // off is the file offset.
+ off int64
+}
+
+func (fd *fileDescription) dentry() *kernfs.Dentry {
+ return fd.vfsfd.Dentry().Impl().(*kernfs.Dentry)
+}
+
+func (fd *fileDescription) inode() *inode {
+ return fd.dentry().Inode().(*inode)
+}
+
+func (fd *fileDescription) filesystem() *vfs.Filesystem {
+ return fd.vfsfd.VirtualDentry().Mount().Filesystem()
+}
+
+func (fd *fileDescription) statusFlags() uint32 {
+ return fd.vfsfd.StatusFlags()
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *fileDescription) Release(ctx context.Context) {
+ // no need to release if FUSE server doesn't implement Open.
+ conn := fd.inode().fs.conn
+ if conn.noOpen {
+ return
+ }
+
+ in := linux.FUSEReleaseIn{
+ Fh: fd.Fh,
+ Flags: fd.statusFlags(),
+ }
+ // TODO(gvisor.dev/issue/3245): add logic when we support file lock owner.
+ var opcode linux.FUSEOpcode
+ if fd.inode().Mode().IsDir() {
+ opcode = linux.FUSE_RELEASEDIR
+ } else {
+ opcode = linux.FUSE_RELEASE
+ }
+ kernelTask := kernel.TaskFromContext(ctx)
+ // ignoring errors and FUSE server reply is analogous to Linux's behavior.
+ req, err := conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), fd.inode().nodeID, opcode, &in)
+ if err != nil {
+ // No way to invoke Call() with an errored request.
+ return
+ }
+ // The reply will be ignored since no callback is defined in asyncCallBack().
+ conn.CallAsync(kernelTask, req)
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ return 0, nil
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ return 0, nil
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ return 0, nil
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ return 0, nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ return 0, nil
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+ fs := fd.filesystem()
+ inode := fd.inode()
+ return inode.Stat(ctx, fs, opts)
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+ fs := fd.filesystem()
+ creds := auth.CredentialsFromContext(ctx)
+ return fd.inode().setAttr(ctx, fs, creds, opts, true, fd.Fh)
+}
diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go
index 810819ae4..65786e42a 100644
--- a/pkg/sentry/fsimpl/fuse/fusefs.go
+++ b/pkg/sentry/fsimpl/fuse/fusefs.go
@@ -16,24 +16,36 @@
package fuse
import (
+ "math"
"strconv"
+ "sync"
+ "sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/waiter"
)
// Name is the default filesystem name.
const Name = "fuse"
+// maxActiveRequestsDefault is the default setting controlling the upper bound
+// on the number of active requests at any given time.
+const maxActiveRequestsDefault = 10000
+
// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type FilesystemType struct{}
+// +stateify savable
type filesystemOptions struct {
// userID specifies the numeric uid of the mount owner.
// This option should not be specified by the filesystem owner.
@@ -56,9 +68,16 @@ type filesystemOptions struct {
// exist at any time. Any further requests will block when trying to
// Call the server.
maxActiveRequests uint64
+
+ // maxRead is the max number of bytes to read,
+ // specified as "max_read" in fs parameters.
+ // If not specified by user, use math.MaxUint32 as default value.
+ maxRead uint32
}
// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
type filesystem struct {
kernfs.Filesystem
devMinor uint32
@@ -69,6 +88,9 @@ type filesystem struct {
// opts is the options the fusefs is initialized with.
opts *filesystemOptions
+
+ // umounted is true if filesystem.Release() has been called.
+ umounted bool
}
// Name implements vfs.FilesystemType.Name.
@@ -142,14 +164,29 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
// Set the maxInFlightRequests option.
fsopts.maxActiveRequests = maxActiveRequestsDefault
+ if maxReadStr, ok := mopts["max_read"]; ok {
+ delete(mopts, "max_read")
+ maxRead, err := strconv.ParseUint(maxReadStr, 10, 32)
+ if err != nil {
+ log.Warningf("%s.GetFilesystem: invalid max_read: max_read=%s", fsType.Name(), maxReadStr)
+ return nil, nil, syserror.EINVAL
+ }
+ if maxRead < fuseMinMaxRead {
+ maxRead = fuseMinMaxRead
+ }
+ fsopts.maxRead = uint32(maxRead)
+ } else {
+ fsopts.maxRead = math.MaxUint32
+ }
+
// Check for unparsed options.
if len(mopts) != 0 {
- log.Warningf("%s.GetFilesystem: unknown options: %v", fsType.Name(), mopts)
+ log.Warningf("%s.GetFilesystem: unsupported or unknown options: %v", fsType.Name(), mopts)
return nil, nil, syserror.EINVAL
}
// Create a new FUSE filesystem.
- fs, err := NewFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd)
+ fs, err := newFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd)
if err != nil {
log.Warningf("%s.NewFUSEFilesystem: failed with error: %v", fsType.Name(), err)
return nil, nil, err
@@ -165,26 +202,28 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
}
// root is the fusefs root directory.
- root := fs.newInode(creds, fsopts.rootMode)
+ root := fs.newRootInode(creds, fsopts.rootMode)
return fs.VFSFilesystem(), root.VFSDentry(), nil
}
-// NewFUSEFilesystem creates a new FUSE filesystem.
-func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) {
- fs := &filesystem{
- devMinor: devMinor,
- opts: opts,
- }
-
- conn, err := newFUSEConnection(ctx, device, opts.maxActiveRequests)
+// newFUSEFilesystem creates a new FUSE filesystem.
+func newFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) {
+ conn, err := newFUSEConnection(ctx, device, opts)
if err != nil {
log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err)
return nil, syserror.EINVAL
}
- fs.conn = conn
fuseFD := device.Impl().(*DeviceFD)
+
+ fs := &filesystem{
+ devMinor: devMinor,
+ opts: opts,
+ conn: conn,
+ }
+
+ fs.VFSFilesystem().IncRef()
fuseFD.fs = fs
return fs, nil
@@ -192,11 +231,22 @@ func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOpt
// Release implements vfs.FilesystemImpl.Release.
func (fs *filesystem) Release(ctx context.Context) {
+ fs.conn.fd.mu.Lock()
+
+ fs.umounted = true
+ fs.conn.Abort(ctx)
+ // Notify all the waiters on this fd.
+ fs.conn.fd.waitQueue.Notify(waiter.EventIn)
+
+ fs.conn.fd.mu.Unlock()
+
fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
fs.Filesystem.Release(ctx)
}
// inode implements kernfs.Inode.
+//
+// +stateify savable
type inode struct {
inodeRefs
kernfs.InodeAttrs
@@ -205,14 +255,50 @@ type inode struct {
kernfs.InodeNotSymlink
kernfs.OrderedChildren
+ dentry kernfs.Dentry
+
+ // the owning filesystem. fs is immutable.
+ fs *filesystem
+
+ // metaDataMu protects the metadata of this inode.
+ metadataMu sync.Mutex
+
+ nodeID uint64
+
locks vfs.FileLocks
- dentry kernfs.Dentry
+ // size of the file.
+ size uint64
+
+ // attributeVersion is the version of inode's attributes.
+ attributeVersion uint64
+
+ // attributeTime is the remaining vaild time of attributes.
+ attributeTime uint64
+
+ // version of the inode.
+ version uint64
+
+ // link is result of following a symbolic link.
+ link string
+}
+
+func (fs *filesystem) newRootInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry {
+ i := &inode{fs: fs}
+ i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, 1, linux.ModeDirectory|0755)
+ i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+ i.EnableLeakCheck()
+ i.dentry.Init(i)
+ i.nodeID = 1
+
+ return &i.dentry
}
-func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry {
- i := &inode{}
- i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
+func (fs *filesystem) newInode(nodeID uint64, attr linux.FUSEAttr) *kernfs.Dentry {
+ i := &inode{fs: fs, nodeID: nodeID}
+ creds := auth.Credentials{EffectiveKGID: auth.KGID(attr.UID), EffectiveKUID: auth.KUID(attr.UID)}
+ i.InodeAttrs.Init(&creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.FileMode(attr.Mode))
+ atomic.StoreUint64(&i.size, attr.Size)
i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
i.EnableLeakCheck()
i.dentry.Init(i)
@@ -221,14 +307,292 @@ func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *ke
}
// Open implements kernfs.Inode.Open.
-func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
- SeekEnd: kernfs.SeekEndStaticEntries,
- })
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ isDir := i.InodeAttrs.Mode().IsDir()
+ // return error if specified to open directory but inode is not a directory.
+ if !isDir && opts.Mode.IsDir() {
+ return nil, syserror.ENOTDIR
+ }
+ if opts.Flags&linux.O_LARGEFILE == 0 && atomic.LoadUint64(&i.size) > linux.MAX_NON_LFS {
+ return nil, syserror.EOVERFLOW
+ }
+
+ var fd *fileDescription
+ var fdImpl vfs.FileDescriptionImpl
+ if isDir {
+ directoryFD := &directoryFD{}
+ fd = &(directoryFD.fileDescription)
+ fdImpl = directoryFD
+ } else {
+ regularFD := &regularFileFD{}
+ fd = &(regularFD.fileDescription)
+ fdImpl = regularFD
+ }
+ // FOPEN_KEEP_CACHE is the defualt flag for noOpen.
+ fd.OpenFlag = linux.FOPEN_KEEP_CACHE
+
+ // Only send open request when FUSE server support open or is opening a directory.
+ if !i.fs.conn.noOpen || isDir {
+ kernelTask := kernel.TaskFromContext(ctx)
+ if kernelTask == nil {
+ log.Warningf("fusefs.Inode.Open: couldn't get kernel task from context")
+ return nil, syserror.EINVAL
+ }
+
+ // Build the request.
+ var opcode linux.FUSEOpcode
+ if isDir {
+ opcode = linux.FUSE_OPENDIR
+ } else {
+ opcode = linux.FUSE_OPEN
+ }
+
+ in := linux.FUSEOpenIn{Flags: opts.Flags & ^uint32(linux.O_CREAT|linux.O_EXCL|linux.O_NOCTTY)}
+ if !i.fs.conn.atomicOTrunc {
+ in.Flags &= ^uint32(linux.O_TRUNC)
+ }
+
+ req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, &in)
+ if err != nil {
+ return nil, err
+ }
+
+ // Send the request and receive the reply.
+ res, err := i.fs.conn.Call(kernelTask, req)
+ if err != nil {
+ return nil, err
+ }
+ if err := res.Error(); err == syserror.ENOSYS && !isDir {
+ i.fs.conn.noOpen = true
+ } else if err != nil {
+ return nil, err
+ } else {
+ out := linux.FUSEOpenOut{}
+ if err := res.UnmarshalPayload(&out); err != nil {
+ return nil, err
+ }
+
+ // Process the reply.
+ fd.OpenFlag = out.OpenFlag
+ if isDir {
+ fd.OpenFlag &= ^uint32(linux.FOPEN_DIRECT_IO)
+ }
+
+ fd.Fh = out.Fh
+ }
+ }
+
+ // TODO(gvisor.dev/issue/3234): invalidate mmap after implemented it for FUSE Inode
+ fd.DirectIO = fd.OpenFlag&linux.FOPEN_DIRECT_IO != 0
+ fdOptions := &vfs.FileDescriptionOptions{}
+ if fd.OpenFlag&linux.FOPEN_NONSEEKABLE != 0 {
+ fdOptions.DenyPRead = true
+ fdOptions.DenyPWrite = true
+ fd.Nonseekable = true
+ }
+
+ // If we don't send SETATTR before open (which is indicated by atomicOTrunc)
+ // and O_TRUNC is set, update the inode's version number and clean existing data
+ // by setting the file size to 0.
+ if i.fs.conn.atomicOTrunc && opts.Flags&linux.O_TRUNC != 0 {
+ i.fs.conn.mu.Lock()
+ i.fs.conn.attributeVersion++
+ i.attributeVersion = i.fs.conn.attributeVersion
+ atomic.StoreUint64(&i.size, 0)
+ i.fs.conn.mu.Unlock()
+ i.attributeTime = 0
+ }
+
+ if err := fd.vfsfd.Init(fdImpl, opts.Flags, rp.Mount(), d.VFSDentry(), fdOptions); err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
+
+// Lookup implements kernfs.Inode.Lookup.
+func (i *inode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, error) {
+ in := linux.FUSELookupIn{Name: name}
+ return i.newEntry(ctx, name, 0, linux.FUSE_LOOKUP, &in)
+}
+
+// IterDirents implements kernfs.Inode.IterDirents.
+func (*inode) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+ return offset, nil
+}
+
+// Valid implements kernfs.Inode.Valid.
+func (*inode) Valid(ctx context.Context) bool {
+ return true
+}
+
+// NewFile implements kernfs.Inode.NewFile.
+func (i *inode) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*kernfs.Dentry, error) {
+ kernelTask := kernel.TaskFromContext(ctx)
+ if kernelTask == nil {
+ log.Warningf("fusefs.Inode.NewFile: couldn't get kernel task from context", i.nodeID)
+ return nil, syserror.EINVAL
+ }
+ in := linux.FUSECreateIn{
+ CreateMeta: linux.FUSECreateMeta{
+ Flags: opts.Flags,
+ Mode: uint32(opts.Mode) | linux.S_IFREG,
+ Umask: uint32(kernelTask.FSContext().Umask()),
+ },
+ Name: name,
+ }
+ return i.newEntry(ctx, name, linux.S_IFREG, linux.FUSE_CREATE, &in)
+}
+
+// NewNode implements kernfs.Inode.NewNode.
+func (i *inode) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*kernfs.Dentry, error) {
+ in := linux.FUSEMknodIn{
+ MknodMeta: linux.FUSEMknodMeta{
+ Mode: uint32(opts.Mode),
+ Rdev: linux.MakeDeviceID(uint16(opts.DevMajor), opts.DevMinor),
+ Umask: uint32(kernel.TaskFromContext(ctx).FSContext().Umask()),
+ },
+ Name: name,
+ }
+ return i.newEntry(ctx, name, opts.Mode.FileType(), linux.FUSE_MKNOD, &in)
+}
+
+// NewSymlink implements kernfs.Inode.NewSymlink.
+func (i *inode) NewSymlink(ctx context.Context, name, target string) (*kernfs.Dentry, error) {
+ in := linux.FUSESymLinkIn{
+ Name: name,
+ Target: target,
+ }
+ return i.newEntry(ctx, name, linux.S_IFLNK, linux.FUSE_SYMLINK, &in)
+}
+
+// Unlink implements kernfs.Inode.Unlink.
+func (i *inode) Unlink(ctx context.Context, name string, child *kernfs.Dentry) error {
+ kernelTask := kernel.TaskFromContext(ctx)
+ if kernelTask == nil {
+ log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID)
+ return syserror.EINVAL
+ }
+ in := linux.FUSEUnlinkIn{Name: name}
+ req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_UNLINK, &in)
+ if err != nil {
+ return err
+ }
+ res, err := i.fs.conn.Call(kernelTask, req)
+ if err != nil {
+ return err
+ }
+ // only return error, discard res.
+ if err := res.Error(); err != nil {
+ return err
+ }
+ return i.dentry.RemoveChildLocked(name, child)
+}
+
+// NewDir implements kernfs.Inode.NewDir.
+func (i *inode) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*kernfs.Dentry, error) {
+ in := linux.FUSEMkdirIn{
+ MkdirMeta: linux.FUSEMkdirMeta{
+ Mode: uint32(opts.Mode),
+ Umask: uint32(kernel.TaskFromContext(ctx).FSContext().Umask()),
+ },
+ Name: name,
+ }
+ return i.newEntry(ctx, name, linux.S_IFDIR, linux.FUSE_MKDIR, &in)
+}
+
+// RmDir implements kernfs.Inode.RmDir.
+func (i *inode) RmDir(ctx context.Context, name string, child *kernfs.Dentry) error {
+ fusefs := i.fs
+ task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
+
+ in := linux.FUSERmDirIn{Name: name}
+ req, err := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_RMDIR, &in)
+ if err != nil {
+ return err
+ }
+
+ res, err := i.fs.conn.Call(task, req)
+ if err != nil {
+ return err
+ }
+ if err := res.Error(); err != nil {
+ return err
+ }
+
+ return i.dentry.RemoveChildLocked(name, child)
+}
+
+// newEntry calls FUSE server for entry creation and allocates corresponding entry according to response.
+// Shared by FUSE_MKNOD, FUSE_MKDIR, FUSE_SYMLINK, FUSE_LINK and FUSE_LOOKUP.
+func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMode, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*kernfs.Dentry, error) {
+ kernelTask := kernel.TaskFromContext(ctx)
+ if kernelTask == nil {
+ log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID)
+ return nil, syserror.EINVAL
+ }
+ req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, payload)
+ if err != nil {
+ return nil, err
+ }
+ res, err := i.fs.conn.Call(kernelTask, req)
if err != nil {
return nil, err
}
- return fd.VFSFileDescription(), nil
+ if err := res.Error(); err != nil {
+ return nil, err
+ }
+ out := linux.FUSEEntryOut{}
+ if err := res.UnmarshalPayload(&out); err != nil {
+ return nil, err
+ }
+ if opcode != linux.FUSE_LOOKUP && ((out.Attr.Mode&linux.S_IFMT)^uint32(fileType) != 0 || out.NodeID == 0 || out.NodeID == linux.FUSE_ROOT_ID) {
+ return nil, syserror.EIO
+ }
+ child := i.fs.newInode(out.NodeID, out.Attr)
+ return child, nil
+}
+
+// Getlink implements kernfs.Inode.Getlink.
+func (i *inode) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
+ path, err := i.Readlink(ctx, mnt)
+ return vfs.VirtualDentry{}, path, err
+}
+
+// Readlink implements kernfs.Inode.Readlink.
+func (i *inode) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
+ if i.Mode().FileType()&linux.S_IFLNK == 0 {
+ return "", syserror.EINVAL
+ }
+ if len(i.link) == 0 {
+ kernelTask := kernel.TaskFromContext(ctx)
+ if kernelTask == nil {
+ log.Warningf("fusefs.Inode.Readlink: couldn't get kernel task from context")
+ return "", syserror.EINVAL
+ }
+ req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_READLINK, &linux.FUSEEmptyIn{})
+ if err != nil {
+ return "", err
+ }
+ res, err := i.fs.conn.Call(kernelTask, req)
+ if err != nil {
+ return "", err
+ }
+ i.link = string(res.data[res.hdr.SizeBytes():])
+ if !mnt.Options().ReadOnly {
+ i.attributeTime = 0
+ }
+ }
+ return i.link, nil
+}
+
+// getFUSEAttr returns a linux.FUSEAttr of this inode stored in local cache.
+// TODO(gvisor.dev/issue/3679): Add support for other fields.
+func (i *inode) getFUSEAttr() linux.FUSEAttr {
+ return linux.FUSEAttr{
+ Ino: i.Ino(),
+ Size: atomic.LoadUint64(&i.size),
+ Mode: uint32(i.Mode()),
+ }
}
// statFromFUSEAttr makes attributes from linux.FUSEAttr to linux.Statx. The
@@ -284,50 +648,92 @@ func statFromFUSEAttr(attr linux.FUSEAttr, mask, devMinor uint32) linux.Statx {
return stat
}
-// Stat implements kernfs.Inode.Stat.
-func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
- fusefs := fs.Impl().(*filesystem)
- conn := fusefs.conn
- task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
+// getAttr gets the attribute of this inode by issuing a FUSE_GETATTR request
+// or read from local cache. It updates the corresponding attributes if
+// necessary.
+func (i *inode) getAttr(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions, flags uint32, fh uint64) (linux.FUSEAttr, error) {
+ attributeVersion := atomic.LoadUint64(&i.fs.conn.attributeVersion)
+
+ // TODO(gvisor.dev/issue/3679): send the request only if
+ // - invalid local cache for fields specified in the opts.Mask
+ // - forced update
+ // - i.attributeTime expired
+ // If local cache is still valid, return local cache.
+ // Currently we always send a request,
+ // and we always set the metadata with the new result,
+ // unless attributeVersion has changed.
+
+ task := kernel.TaskFromContext(ctx)
if task == nil {
log.Warningf("couldn't get kernel task from context")
- return linux.Statx{}, syserror.EINVAL
+ return linux.FUSEAttr{}, syserror.EINVAL
}
- var in linux.FUSEGetAttrIn
- // We don't set any attribute in the request, because in VFS2 fstat(2) will
- // finally be translated into vfs.FilesystemImpl.StatAt() (see
- // pkg/sentry/syscalls/linux/vfs2/stat.go), resulting in the same flow
- // as stat(2). Thus GetAttrFlags and Fh variable will never be used in VFS2.
- req, err := conn.NewRequest(creds, uint32(task.ThreadID()), i.Ino(), linux.FUSE_GETATTR, &in)
+ creds := auth.CredentialsFromContext(ctx)
+
+ in := linux.FUSEGetAttrIn{
+ GetAttrFlags: flags,
+ Fh: fh,
+ }
+ req, err := i.fs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_GETATTR, &in)
if err != nil {
- return linux.Statx{}, err
+ return linux.FUSEAttr{}, err
}
- res, err := conn.Call(task, req)
+ res, err := i.fs.conn.Call(task, req)
if err != nil {
- return linux.Statx{}, err
+ return linux.FUSEAttr{}, err
}
if err := res.Error(); err != nil {
- return linux.Statx{}, err
+ return linux.FUSEAttr{}, err
}
var out linux.FUSEGetAttrOut
if err := res.UnmarshalPayload(&out); err != nil {
- return linux.Statx{}, err
+ return linux.FUSEAttr{}, err
+ }
+
+ // Local version is newer, return the local one.
+ // Skip the update.
+ if attributeVersion != 0 && atomic.LoadUint64(&i.attributeVersion) > attributeVersion {
+ return i.getFUSEAttr(), nil
}
- // Set all metadata into kernfs.InodeAttrs.
- if err := i.SetStat(ctx, fs, creds, vfs.SetStatOptions{
- Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, fusefs.devMinor),
+ // Set the metadata of kernfs.InodeAttrs.
+ if err := i.SetInodeStat(ctx, fs, creds, vfs.SetStatOptions{
+ Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, i.fs.devMinor),
}); err != nil {
+ return linux.FUSEAttr{}, err
+ }
+
+ // Set the size if no error (after SetStat() check).
+ atomic.StoreUint64(&i.size, out.Attr.Size)
+
+ return out.Attr, nil
+}
+
+// reviseAttr attempts to update the attributes for internal purposes
+// by calling getAttr with a pre-specified mask.
+// Used by read, write, lseek.
+func (i *inode) reviseAttr(ctx context.Context, flags uint32, fh uint64) error {
+ // Never need atime for internal purposes.
+ _, err := i.getAttr(ctx, i.fs.VFSFilesystem(), vfs.StatOptions{
+ Mask: linux.STATX_BASIC_STATS &^ linux.STATX_ATIME,
+ }, flags, fh)
+ return err
+}
+
+// Stat implements kernfs.Inode.Stat.
+func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+ attr, err := i.getAttr(ctx, fs, opts, 0, 0)
+ if err != nil {
return linux.Statx{}, err
}
- return statFromFUSEAttr(out.Attr, opts.Mask, fusefs.devMinor), nil
+ return statFromFUSEAttr(attr, opts.Mask, i.fs.devMinor), nil
}
-// DecRef implements kernfs.Inode.
+// DecRef implements kernfs.Inode.DecRef.
func (i *inode) DecRef(context.Context) {
i.inodeRefs.DecRef(i.Destroy)
}
@@ -337,3 +743,84 @@ func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, e
// TODO(gvisor.dev/issues/3413): Complete the implementation of statfs.
return vfs.GenericStatFS(linux.FUSE_SUPER_MAGIC), nil
}
+
+// fattrMaskFromStats converts vfs.SetStatOptions.Stat.Mask to linux stats mask
+// aligned with the attribute mask defined in include/linux/fs.h.
+func fattrMaskFromStats(mask uint32) uint32 {
+ var fuseAttrMask uint32
+ maskMap := map[uint32]uint32{
+ linux.STATX_MODE: linux.FATTR_MODE,
+ linux.STATX_UID: linux.FATTR_UID,
+ linux.STATX_GID: linux.FATTR_GID,
+ linux.STATX_SIZE: linux.FATTR_SIZE,
+ linux.STATX_ATIME: linux.FATTR_ATIME,
+ linux.STATX_MTIME: linux.FATTR_MTIME,
+ linux.STATX_CTIME: linux.FATTR_CTIME,
+ }
+ for statxMask, fattrMask := range maskMap {
+ if mask&statxMask != 0 {
+ fuseAttrMask |= fattrMask
+ }
+ }
+ return fuseAttrMask
+}
+
+// SetStat implements kernfs.Inode.SetStat.
+func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+ return i.setAttr(ctx, fs, creds, opts, false, 0)
+}
+
+func (i *inode) setAttr(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions, useFh bool, fh uint64) error {
+ conn := i.fs.conn
+ task := kernel.TaskFromContext(ctx)
+ if task == nil {
+ log.Warningf("couldn't get kernel task from context")
+ return syserror.EINVAL
+ }
+
+ // We should retain the original file type when assigning new mode.
+ fileType := uint16(i.Mode()) & linux.S_IFMT
+ fattrMask := fattrMaskFromStats(opts.Stat.Mask)
+ if useFh {
+ fattrMask |= linux.FATTR_FH
+ }
+ in := linux.FUSESetAttrIn{
+ Valid: fattrMask,
+ Fh: fh,
+ Size: opts.Stat.Size,
+ Atime: uint64(opts.Stat.Atime.Sec),
+ Mtime: uint64(opts.Stat.Mtime.Sec),
+ Ctime: uint64(opts.Stat.Ctime.Sec),
+ AtimeNsec: opts.Stat.Atime.Nsec,
+ MtimeNsec: opts.Stat.Mtime.Nsec,
+ CtimeNsec: opts.Stat.Ctime.Nsec,
+ Mode: uint32(fileType | opts.Stat.Mode),
+ UID: opts.Stat.UID,
+ GID: opts.Stat.GID,
+ }
+ req, err := conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_SETATTR, &in)
+ if err != nil {
+ return err
+ }
+
+ res, err := conn.Call(task, req)
+ if err != nil {
+ return err
+ }
+ if err := res.Error(); err != nil {
+ return err
+ }
+ out := linux.FUSEGetAttrOut{}
+ if err := res.UnmarshalPayload(&out); err != nil {
+ return err
+ }
+
+ // Set the metadata of kernfs.InodeAttrs.
+ if err := i.SetInodeStat(ctx, fs, creds, vfs.SetStatOptions{
+ Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, i.fs.devMinor),
+ }); err != nil {
+ return err
+ }
+
+ return nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/read_write.go b/pkg/sentry/fsimpl/fuse/read_write.go
new file mode 100644
index 000000000..625d1547f
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/read_write.go
@@ -0,0 +1,242 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "io"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// ReadInPages sends FUSE_READ requests for the size after round it up to
+// a multiple of page size, blocks on it for reply, processes the reply
+// and returns the payload (or joined payloads) as a byte slice.
+// This is used for the general purpose reading.
+// We do not support direct IO (which read the exact number of bytes)
+// at this moment.
+func (fs *filesystem) ReadInPages(ctx context.Context, fd *regularFileFD, off uint64, size uint32) ([][]byte, uint32, error) {
+ attributeVersion := atomic.LoadUint64(&fs.conn.attributeVersion)
+
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ log.Warningf("fusefs.Read: couldn't get kernel task from context")
+ return nil, 0, syserror.EINVAL
+ }
+
+ // Round up to a multiple of page size.
+ readSize, _ := usermem.PageRoundUp(uint64(size))
+
+ // One request cannnot exceed either maxRead or maxPages.
+ maxPages := fs.conn.maxRead >> usermem.PageShift
+ if maxPages > uint32(fs.conn.maxPages) {
+ maxPages = uint32(fs.conn.maxPages)
+ }
+
+ var outs [][]byte
+ var sizeRead uint32
+
+ // readSize is a multiple of usermem.PageSize.
+ // Always request bytes as a multiple of pages.
+ pagesRead, pagesToRead := uint32(0), uint32(readSize>>usermem.PageShift)
+
+ // Reuse the same struct for unmarshalling to avoid unnecessary memory allocation.
+ in := linux.FUSEReadIn{
+ Fh: fd.Fh,
+ LockOwner: 0, // TODO(gvisor.dev/issue/3245): file lock
+ ReadFlags: 0, // TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER
+ Flags: fd.statusFlags(),
+ }
+
+ // This loop is intended for fragmented read where the bytes to read is
+ // larger than either the maxPages or maxRead.
+ // For the majority of reads with normal size, this loop should only
+ // execute once.
+ for pagesRead < pagesToRead {
+ pagesCanRead := pagesToRead - pagesRead
+ if pagesCanRead > maxPages {
+ pagesCanRead = maxPages
+ }
+
+ in.Offset = off + (uint64(pagesRead) << usermem.PageShift)
+ in.Size = pagesCanRead << usermem.PageShift
+
+ req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), fd.inode().nodeID, linux.FUSE_READ, &in)
+ if err != nil {
+ return nil, 0, err
+ }
+
+ // TODO(gvisor.dev/issue/3247): support async read.
+
+ res, err := fs.conn.Call(t, req)
+ if err != nil {
+ return nil, 0, err
+ }
+ if err := res.Error(); err != nil {
+ return nil, 0, err
+ }
+
+ // Not enough bytes in response,
+ // either we reached EOF,
+ // or the FUSE server sends back a response
+ // that cannot even fit the hdr.
+ if len(res.data) <= res.hdr.SizeBytes() {
+ // We treat both case as EOF here for now
+ // since there is no reliable way to detect
+ // the over-short hdr case.
+ break
+ }
+
+ // Directly using the slice to avoid extra copy.
+ out := res.data[res.hdr.SizeBytes():]
+
+ outs = append(outs, out)
+ sizeRead += uint32(len(out))
+
+ pagesRead += pagesCanRead
+ }
+
+ defer fs.ReadCallback(ctx, fd, off, size, sizeRead, attributeVersion)
+
+ // No bytes returned: offset >= EOF.
+ if len(outs) == 0 {
+ return nil, 0, io.EOF
+ }
+
+ return outs, sizeRead, nil
+}
+
+// ReadCallback updates several information after receiving a read response.
+// Due to readahead, sizeRead can be larger than size.
+func (fs *filesystem) ReadCallback(ctx context.Context, fd *regularFileFD, off uint64, size uint32, sizeRead uint32, attributeVersion uint64) {
+ // TODO(gvisor.dev/issue/3247): support async read.
+ // If this is called by an async read, correctly process it.
+ // May need to update the signature.
+
+ i := fd.inode()
+ // TODO(gvisor.dev/issue/1193): Invalidate or update atime.
+
+ // Reached EOF.
+ if sizeRead < size {
+ // TODO(gvisor.dev/issue/3630): If we have writeback cache, then we need to fill this hole.
+ // Might need to update the buf to be returned from the Read().
+
+ // Update existing size.
+ newSize := off + uint64(sizeRead)
+ fs.conn.mu.Lock()
+ if attributeVersion == i.attributeVersion && newSize < atomic.LoadUint64(&i.size) {
+ fs.conn.attributeVersion++
+ i.attributeVersion = i.fs.conn.attributeVersion
+ atomic.StoreUint64(&i.size, newSize)
+ }
+ fs.conn.mu.Unlock()
+ }
+}
+
+// Write sends FUSE_WRITE requests and return the bytes
+// written according to the response.
+//
+// Preconditions: len(data) == size.
+func (fs *filesystem) Write(ctx context.Context, fd *regularFileFD, off uint64, size uint32, data []byte) (uint32, error) {
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ log.Warningf("fusefs.Read: couldn't get kernel task from context")
+ return 0, syserror.EINVAL
+ }
+
+ // One request cannnot exceed either maxWrite or maxPages.
+ maxWrite := uint32(fs.conn.maxPages) << usermem.PageShift
+ if maxWrite > fs.conn.maxWrite {
+ maxWrite = fs.conn.maxWrite
+ }
+
+ // Reuse the same struct for unmarshalling to avoid unnecessary memory allocation.
+ in := linux.FUSEWriteIn{
+ Fh: fd.Fh,
+ // TODO(gvisor.dev/issue/3245): file lock
+ LockOwner: 0,
+ // TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER
+ // TODO(gvisor.dev/issue/3237): |= linux.FUSE_WRITE_CACHE (not added yet)
+ WriteFlags: 0,
+ Flags: fd.statusFlags(),
+ }
+
+ var written uint32
+
+ // This loop is intended for fragmented write where the bytes to write is
+ // larger than either the maxWrite or maxPages or when bigWrites is false.
+ // Unless a small value for max_write is explicitly used, this loop
+ // is expected to execute only once for the majority of the writes.
+ for written < size {
+ toWrite := size - written
+
+ // Limit the write size to one page.
+ // Note that the bigWrites flag is obsolete,
+ // latest libfuse always sets it on.
+ if !fs.conn.bigWrites && toWrite > usermem.PageSize {
+ toWrite = usermem.PageSize
+ }
+
+ // Limit the write size to maxWrite.
+ if toWrite > maxWrite {
+ toWrite = maxWrite
+ }
+
+ in.Offset = off + uint64(written)
+ in.Size = toWrite
+
+ req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), fd.inode().nodeID, linux.FUSE_WRITE, &in)
+ if err != nil {
+ return 0, err
+ }
+
+ req.payload = data[written : written+toWrite]
+
+ // TODO(gvisor.dev/issue/3247): support async write.
+
+ res, err := fs.conn.Call(t, req)
+ if err != nil {
+ return 0, err
+ }
+ if err := res.Error(); err != nil {
+ return 0, err
+ }
+
+ out := linux.FUSEWriteOut{}
+ if err := res.UnmarshalPayload(&out); err != nil {
+ return 0, err
+ }
+
+ // Write more than requested? EIO.
+ if out.Size > toWrite {
+ return 0, syserror.EIO
+ }
+
+ written += out.Size
+
+ // Break if short write. Not necessarily an error.
+ if out.Size != toWrite {
+ break
+ }
+ }
+
+ return written, nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/regular_file.go b/pkg/sentry/fsimpl/fuse/regular_file.go
new file mode 100644
index 000000000..5bdd096c3
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/regular_file.go
@@ -0,0 +1,230 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "io"
+ "math"
+ "sync"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+type regularFileFD struct {
+ fileDescription
+
+ // off is the file offset.
+ off int64
+ // offMu protects off.
+ offMu sync.Mutex
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+
+ // Check that flags are supported.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+ if opts.Flags&^linux.RWF_HIPRI != 0 {
+ return 0, syserror.EOPNOTSUPP
+ }
+
+ size := dst.NumBytes()
+ if size == 0 {
+ // Early return if count is 0.
+ return 0, nil
+ } else if size > math.MaxUint32 {
+ // FUSE only supports uint32 for size.
+ // Overflow.
+ return 0, syserror.EINVAL
+ }
+
+ // TODO(gvisor.dev/issue/3678): Add direct IO support.
+
+ inode := fd.inode()
+
+ // Reading beyond EOF, update file size if outdated.
+ if uint64(offset+size) > atomic.LoadUint64(&inode.size) {
+ if err := inode.reviseAttr(ctx, linux.FUSE_GETATTR_FH, fd.Fh); err != nil {
+ return 0, err
+ }
+ // If the offset after update is still too large, return error.
+ if uint64(offset) >= atomic.LoadUint64(&inode.size) {
+ return 0, io.EOF
+ }
+ }
+
+ // Truncate the read with updated file size.
+ fileSize := atomic.LoadUint64(&inode.size)
+ if uint64(offset+size) > fileSize {
+ size = int64(fileSize) - offset
+ }
+
+ buffers, n, err := inode.fs.ReadInPages(ctx, fd, uint64(offset), uint32(size))
+ if err != nil {
+ return 0, err
+ }
+
+ // TODO(gvisor.dev/issue/3237): support indirect IO (e.g. caching),
+ // store the bytes that were read ahead.
+
+ // Update the number of bytes to copy for short read.
+ if n < uint32(size) {
+ size = int64(n)
+ }
+
+ // Copy the bytes read to the dst.
+ // This loop is intended for fragmented reads.
+ // For the majority of reads, this loop only execute once.
+ var copied int64
+ for _, buffer := range buffers {
+ toCopy := int64(len(buffer))
+ if copied+toCopy > size {
+ toCopy = size - copied
+ }
+ cp, err := dst.DropFirst64(copied).CopyOut(ctx, buffer[:toCopy])
+ if err != nil {
+ return 0, err
+ }
+ if int64(cp) != toCopy {
+ return 0, syserror.EIO
+ }
+ copied += toCopy
+ }
+
+ return copied, nil
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ fd.offMu.Lock()
+ n, err := fd.PRead(ctx, dst, fd.off, opts)
+ fd.off += n
+ fd.offMu.Unlock()
+ return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ n, _, err := fd.pwrite(ctx, src, offset, opts)
+ return n, err
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ fd.offMu.Lock()
+ n, off, err := fd.pwrite(ctx, src, fd.off, opts)
+ fd.off = off
+ fd.offMu.Unlock()
+ return n, err
+}
+
+// pwrite returns the number of bytes written, final offset and error. The
+// final offset should be ignored by PWrite.
+func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
+ if offset < 0 {
+ return 0, offset, syserror.EINVAL
+ }
+
+ // Check that flags are supported.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+ if opts.Flags&^linux.RWF_HIPRI != 0 {
+ return 0, offset, syserror.EOPNOTSUPP
+ }
+
+ inode := fd.inode()
+ inode.metadataMu.Lock()
+ defer inode.metadataMu.Unlock()
+
+ // If the file is opened with O_APPEND, update offset to file size.
+ // Note: since our Open() implements the interface of kernfs,
+ // and kernfs currently does not support O_APPEND, this will never
+ // be true before we switch out from kernfs.
+ if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+ // Locking inode.metadataMu is sufficient for reading size
+ offset = int64(inode.size)
+ }
+
+ srclen := src.NumBytes()
+
+ if srclen > math.MaxUint32 {
+ // FUSE only supports uint32 for size.
+ // Overflow.
+ return 0, offset, syserror.EINVAL
+ }
+ if end := offset + srclen; end < offset {
+ // Overflow.
+ return 0, offset, syserror.EINVAL
+ }
+
+ srclen, err = vfs.CheckLimit(ctx, offset, srclen)
+ if err != nil {
+ return 0, offset, err
+ }
+
+ if srclen == 0 {
+ // Return before causing any side effects.
+ return 0, offset, nil
+ }
+
+ src = src.TakeFirst64(srclen)
+
+ // TODO(gvisor.dev/issue/3237): Add cache support:
+ // buffer cache. Ideally we write from src to our buffer cache first.
+ // The slice passed to fs.Write() should be a slice from buffer cache.
+ data := make([]byte, srclen)
+ // Reason for making a copy here: connection.Call() blocks on kerneltask,
+ // which in turn acquires mm.activeMu lock. Functions like CopyInTo() will
+ // attemp to acquire the mm.activeMu lock as well -> deadlock.
+ // We must finish reading from the userspace memory before
+ // t.Block() deactivates it.
+ cp, err := src.CopyIn(ctx, data)
+ if err != nil {
+ return 0, offset, err
+ }
+ if int64(cp) != srclen {
+ return 0, offset, syserror.EIO
+ }
+
+ n, err := fd.inode().fs.Write(ctx, fd, uint64(offset), uint32(srclen), data)
+ if err != nil {
+ return 0, offset, err
+ }
+
+ if n == 0 {
+ // We have checked srclen != 0 previously.
+ // If err == nil, then it's a short write and we return EIO.
+ return 0, offset, syserror.EIO
+ }
+
+ written = int64(n)
+ finalOff = offset + written
+
+ if finalOff > int64(inode.size) {
+ atomic.StoreUint64(&inode.size, uint64(finalOff))
+ atomic.AddUint64(&inode.fs.conn.attributeVersion, 1)
+ }
+
+ return
+}
diff --git a/pkg/sentry/fsimpl/fuse/request_response.go b/pkg/sentry/fsimpl/fuse/request_response.go
new file mode 100644
index 000000000..7fa00569b
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/request_response.go
@@ -0,0 +1,229 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "fmt"
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// fuseInitRes is a variable-length wrapper of linux.FUSEInitOut. The FUSE
+// server may implement an older version of FUSE protocol, which contains a
+// linux.FUSEInitOut with less attributes.
+//
+// Dynamically-sized objects cannot be marshalled.
+type fuseInitRes struct {
+ marshal.StubMarshallable
+
+ // initOut contains the response from the FUSE server.
+ initOut linux.FUSEInitOut
+
+ // initLen is the total length of bytes of the response.
+ initLen uint32
+}
+
+// UnmarshalBytes deserializes src to the initOut attribute in a fuseInitRes.
+func (r *fuseInitRes) UnmarshalBytes(src []byte) {
+ out := &r.initOut
+
+ // Introduced before FUSE kernel version 7.13.
+ out.Major = uint32(usermem.ByteOrder.Uint32(src[:4]))
+ src = src[4:]
+ out.Minor = uint32(usermem.ByteOrder.Uint32(src[:4]))
+ src = src[4:]
+ out.MaxReadahead = uint32(usermem.ByteOrder.Uint32(src[:4]))
+ src = src[4:]
+ out.Flags = uint32(usermem.ByteOrder.Uint32(src[:4]))
+ src = src[4:]
+ out.MaxBackground = uint16(usermem.ByteOrder.Uint16(src[:2]))
+ src = src[2:]
+ out.CongestionThreshold = uint16(usermem.ByteOrder.Uint16(src[:2]))
+ src = src[2:]
+ out.MaxWrite = uint32(usermem.ByteOrder.Uint32(src[:4]))
+ src = src[4:]
+
+ // Introduced in FUSE kernel version 7.23.
+ if len(src) >= 4 {
+ out.TimeGran = uint32(usermem.ByteOrder.Uint32(src[:4]))
+ src = src[4:]
+ }
+ // Introduced in FUSE kernel version 7.28.
+ if len(src) >= 2 {
+ out.MaxPages = uint16(usermem.ByteOrder.Uint16(src[:2]))
+ src = src[2:]
+ }
+}
+
+// SizeBytes is the size of the payload of the FUSE_INIT response.
+func (r *fuseInitRes) SizeBytes() int {
+ return int(r.initLen)
+}
+
+// Ordinary requests have even IDs, while interrupts IDs are odd.
+// Used to increment the unique ID for each FUSE request.
+var reqIDStep uint64 = 2
+
+// Request represents a FUSE operation request that hasn't been sent to the
+// server yet.
+//
+// +stateify savable
+type Request struct {
+ requestEntry
+
+ id linux.FUSEOpID
+ hdr *linux.FUSEHeaderIn
+ data []byte
+
+ // payload for this request: extra bytes to write after
+ // the data slice. Used by FUSE_WRITE.
+ payload []byte
+
+ // If this request is async.
+ async bool
+ // If we don't care its response.
+ // Manually set by the caller.
+ noReply bool
+}
+
+// NewRequest creates a new request that can be sent to the FUSE server.
+func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) {
+ conn.fd.mu.Lock()
+ defer conn.fd.mu.Unlock()
+ conn.fd.nextOpID += linux.FUSEOpID(reqIDStep)
+
+ hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes()
+ hdr := linux.FUSEHeaderIn{
+ Len: uint32(hdrLen + payload.SizeBytes()),
+ Opcode: opcode,
+ Unique: conn.fd.nextOpID,
+ NodeID: ino,
+ UID: uint32(creds.EffectiveKUID),
+ GID: uint32(creds.EffectiveKGID),
+ PID: pid,
+ }
+
+ buf := make([]byte, hdr.Len)
+
+ // TODO(gVisor.dev/issue/3698): Use the unsafe version once go_marshal is safe to use again.
+ hdr.MarshalBytes(buf[:hdrLen])
+ payload.MarshalBytes(buf[hdrLen:])
+
+ return &Request{
+ id: hdr.Unique,
+ hdr: &hdr,
+ data: buf,
+ }, nil
+}
+
+// futureResponse represents an in-flight request, that may or may not have
+// completed yet. Convert it to a resolved Response by calling Resolve, but note
+// that this may block.
+//
+// +stateify savable
+type futureResponse struct {
+ opcode linux.FUSEOpcode
+ ch chan struct{}
+ hdr *linux.FUSEHeaderOut
+ data []byte
+
+ // If this request is async.
+ async bool
+}
+
+// newFutureResponse creates a future response to a FUSE request.
+func newFutureResponse(req *Request) *futureResponse {
+ return &futureResponse{
+ opcode: req.hdr.Opcode,
+ ch: make(chan struct{}),
+ async: req.async,
+ }
+}
+
+// resolve blocks the task until the server responds to its corresponding request,
+// then returns a resolved response.
+func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) {
+ // Return directly for async requests.
+ if f.async {
+ return nil, nil
+ }
+
+ if err := t.Block(f.ch); err != nil {
+ return nil, err
+ }
+
+ return f.getResponse(), nil
+}
+
+// getResponse creates a Response from the data the futureResponse has.
+func (f *futureResponse) getResponse() *Response {
+ return &Response{
+ opcode: f.opcode,
+ hdr: *f.hdr,
+ data: f.data,
+ }
+}
+
+// Response represents an actual response from the server, including the
+// response payload.
+//
+// +stateify savable
+type Response struct {
+ opcode linux.FUSEOpcode
+ hdr linux.FUSEHeaderOut
+ data []byte
+}
+
+// Error returns the error of the FUSE call.
+func (r *Response) Error() error {
+ errno := r.hdr.Error
+ if errno >= 0 {
+ return nil
+ }
+
+ sysErrNo := syscall.Errno(-errno)
+ return error(sysErrNo)
+}
+
+// DataLen returns the size of the response without the header.
+func (r *Response) DataLen() uint32 {
+ return r.hdr.Len - uint32(r.hdr.SizeBytes())
+}
+
+// UnmarshalPayload unmarshals the response data into m.
+func (r *Response) UnmarshalPayload(m marshal.Marshallable) error {
+ hdrLen := r.hdr.SizeBytes()
+ haveDataLen := r.hdr.Len - uint32(hdrLen)
+ wantDataLen := uint32(m.SizeBytes())
+
+ if haveDataLen < wantDataLen {
+ return fmt.Errorf("payload too small. Minimum data lenth required: %d, but got data length %d", wantDataLen, haveDataLen)
+ }
+
+ // The response data is empty unless there is some payload. And so, doesn't
+ // need to be unmarshalled.
+ if r.data == nil {
+ return nil
+ }
+
+ // TODO(gVisor.dev/issue/3698): Use the unsafe version once go_marshal is safe to use again.
+ m.UnmarshalBytes(r.data[hdrLen:])
+ return nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/utils_test.go b/pkg/sentry/fsimpl/fuse/utils_test.go
new file mode 100644
index 000000000..e1d9e3365
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/utils_test.go
@@ -0,0 +1,132 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "io"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+func setup(t *testing.T) *testutil.System {
+ k, err := testutil.Boot()
+ if err != nil {
+ t.Fatalf("Error creating kernel: %v", err)
+ }
+
+ ctx := k.SupervisorContext()
+ creds := auth.CredentialsFromContext(ctx)
+
+ k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserList: true,
+ AllowUserMount: true,
+ })
+
+ mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
+ if err != nil {
+ t.Fatalf("NewMountNamespace(): %v", err)
+ }
+
+ return testutil.NewSystem(ctx, t, k.VFS(), mntns)
+}
+
+// newTestConnection creates a fuse connection that the sentry can communicate with
+// and the FD for the server to communicate with.
+func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*connection, *vfs.FileDescription, error) {
+ vfsObj := &vfs.VirtualFilesystem{}
+ fuseDev := &DeviceFD{}
+
+ if err := vfsObj.Init(system.Ctx); err != nil {
+ return nil, nil, err
+ }
+
+ vd := vfsObj.NewAnonVirtualDentry("genCountFD")
+ defer vd.DecRef(system.Ctx)
+ if err := fuseDev.vfsfd.Init(fuseDev, linux.O_RDWR|linux.O_CREAT, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{}); err != nil {
+ return nil, nil, err
+ }
+
+ fsopts := filesystemOptions{
+ maxActiveRequests: maxActiveRequests,
+ }
+ fs, err := newFUSEFilesystem(system.Ctx, 0, &fsopts, &fuseDev.vfsfd)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ return fs.conn, &fuseDev.vfsfd, nil
+}
+
+type testPayload struct {
+ marshal.StubMarshallable
+ data uint32
+}
+
+// SizeBytes implements marshal.Marshallable.SizeBytes.
+func (t *testPayload) SizeBytes() int {
+ return 4
+}
+
+// MarshalBytes implements marshal.Marshallable.MarshalBytes.
+func (t *testPayload) MarshalBytes(dst []byte) {
+ usermem.ByteOrder.PutUint32(dst[:4], t.data)
+}
+
+// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.
+func (t *testPayload) UnmarshalBytes(src []byte) {
+ *t = testPayload{data: usermem.ByteOrder.Uint32(src[:4])}
+}
+
+// Packed implements marshal.Marshallable.Packed.
+func (t *testPayload) Packed() bool {
+ return true
+}
+
+// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.
+func (t *testPayload) MarshalUnsafe(dst []byte) {
+ t.MarshalBytes(dst)
+}
+
+// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.
+func (t *testPayload) UnmarshalUnsafe(src []byte) {
+ t.UnmarshalBytes(src)
+}
+
+// CopyOutN implements marshal.Marshallable.CopyOutN.
+func (t *testPayload) CopyOutN(task marshal.CopyContext, addr usermem.Addr, limit int) (int, error) {
+ panic("not implemented")
+}
+
+// CopyOut implements marshal.Marshallable.CopyOut.
+func (t *testPayload) CopyOut(task marshal.CopyContext, addr usermem.Addr) (int, error) {
+ panic("not implemented")
+}
+
+// CopyIn implements marshal.Marshallable.CopyIn.
+func (t *testPayload) CopyIn(task marshal.CopyContext, addr usermem.Addr) (int, error) {
+ panic("not implemented")
+}
+
+// WriteTo implements io.WriterTo.WriteTo.
+func (t *testPayload) WriteTo(w io.Writer) (int64, error) {
+ panic("not implemented")
+}
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index 91d2ae199..18c884b59 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -117,11 +117,12 @@ func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) {
d.syntheticChildren++
}
+// +stateify savable
type directoryFD struct {
fileDescription
vfs.DirectoryFileDescriptionDefaultImpl
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
off int64
dirents []vfs.Dirent
}
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 5d0f487db..94d96261b 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -1026,7 +1026,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
// step is required even if !d.cachedMetadataAuthoritative() because
// d.mappings has to be updated.
// d.metadataMu has already been acquired if trunc == true.
- d.updateFileSizeLocked(0)
+ d.updateSizeLocked(0)
if d.cachedMetadataAuthoritative() {
d.touchCMtimeLocked()
@@ -1311,6 +1311,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
if !renamed.isDir() {
return syserror.EISDIR
}
+ if genericIsAncestorDentry(replaced, renamed) {
+ return syserror.ENOTEMPTY
+ }
} else {
if rp.MustBeDir() || renamed.isDir() {
return syserror.ENOTDIR
@@ -1361,14 +1364,15 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
// with reference counts and queue oldParent for checkCachingLocked if the
// parent isn't actually changing.
if oldParent != newParent {
+ oldParent.decRefLocked()
ds = appendDentry(ds, oldParent)
newParent.IncRef()
if renamed.isSynthetic() {
oldParent.syntheticChildren--
newParent.syntheticChildren++
}
+ renamed.parent = newParent
}
- renamed.parent = newParent
renamed.name = newName
if newParent.children == nil {
newParent.children = make(map[string]*dentry)
@@ -1412,11 +1416,11 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
return err
}
- if err := d.setStat(ctx, rp.Credentials(), &opts, rp.Mount()); err != nil {
- fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ err = d.setStat(ctx, rp.Credentials(), &opts, rp.Mount())
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ if err != nil {
return err
}
- fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
@@ -1491,7 +1495,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
return fs.unlinkAt(ctx, rp, false /* dir */)
}
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
var ds *[]*dentry
fs.renameMu.RLock()
@@ -1519,8 +1523,8 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
return nil, syserror.ECONNREFUSED
}
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
@@ -1528,11 +1532,11 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
if err != nil {
return nil, err
}
- return d.listxattr(ctx, rp.Credentials(), size)
+ return d.listXattr(ctx, rp.Credentials(), size)
}
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
@@ -1540,11 +1544,11 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
if err != nil {
return "", err
}
- return d.getxattr(ctx, rp.Credentials(), &opts)
+ return d.getXattr(ctx, rp.Credentials(), &opts)
}
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
var ds *[]*dentry
fs.renameMu.RLock()
d, err := fs.resolveLocked(ctx, rp, &ds)
@@ -1552,18 +1556,18 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
return err
}
- if err := d.setxattr(ctx, rp.Credentials(), &opts); err != nil {
- fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ err = d.setXattr(ctx, rp.Credentials(), &opts)
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ if err != nil {
return err
}
- fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
return nil
}
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
var ds *[]*dentry
fs.renameMu.RLock()
d, err := fs.resolveLocked(ctx, rp, &ds)
@@ -1571,11 +1575,11 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath,
fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
return err
}
- if err := d.removexattr(ctx, rp.Credentials(), name); err != nil {
- fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ err = d.removeXattr(ctx, rp.Credentials(), name)
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ if err != nil {
return err
}
- fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
return nil
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 57bff1789..8608471f8 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -62,9 +62,13 @@ import (
const Name = "9p"
// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type FilesystemType struct{}
// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
type filesystem struct {
vfsfs vfs.Filesystem
@@ -77,7 +81,7 @@ type filesystem struct {
iopts InternalFilesystemOptions
// client is the client used by this filesystem. client is immutable.
- client *p9.Client
+ client *p9.Client `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
// clock is a realtime clock used to set timestamps in file operations.
clock ktime.Clock
@@ -95,7 +99,7 @@ type filesystem struct {
// reference count (such that it is usable as vfs.ResolvingPath.Start() or
// is reachable from its children), or if it is a child dentry (such that
// it is reachable from its parent).
- renameMu sync.RWMutex
+ renameMu sync.RWMutex `state:"nosave"`
// cachedDentries contains all dentries with 0 references. (Due to race
// conditions, it may also contain dentries with non-zero references.)
@@ -107,7 +111,7 @@ type filesystem struct {
// syncableDentries contains all dentries in this filesystem for which
// !dentry.file.isNil(). specialFileFDs contains all open specialFileFDs.
// These fields are protected by syncMu.
- syncMu sync.Mutex
+ syncMu sync.Mutex `state:"nosave"`
syncableDentries map[*dentry]struct{}
specialFileFDs map[*specialFileFD]struct{}
@@ -120,6 +124,8 @@ type filesystem struct {
// dentries, it comes from QID.Path from the 9P server. Synthetic dentries
// have have their inodeNumber generated sequentially, with the MSB reserved to
// prevent conflicts with regular dentries.
+//
+// +stateify savable
type inodeNumber uint64
// Reserve MSB for synthetic mounts.
@@ -132,6 +138,7 @@ func inoFromPath(path uint64) inodeNumber {
return inodeNumber(path &^ syntheticInoMask)
}
+// +stateify savable
type filesystemOptions struct {
// "Standard" 9P options.
fd int
@@ -177,6 +184,8 @@ type filesystemOptions struct {
// InteropMode controls the client's interaction with other remote filesystem
// users.
+//
+// +stateify savable
type InteropMode uint32
const (
@@ -195,11 +204,7 @@ const (
// and consistent with Linux's semantics (in particular, it is not always
// possible for clients to set arbitrary atimes and mtimes depending on the
// remote filesystem implementation, and never possible for clients to set
- // arbitrary ctimes.) If a dentry containing a client-defined atime or
- // mtime is evicted from cache, client timestamps will be sent to the
- // remote filesystem on a best-effort basis to attempt to ensure that
- // timestamps will be preserved when another dentry representing the same
- // file is instantiated.
+ // arbitrary ctimes.)
InteropModeExclusive InteropMode = iota
// InteropModeWritethrough is appropriate when there are read-only users of
@@ -239,6 +244,8 @@ const (
// InternalFilesystemOptions may be passed as
// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
+//
+// +stateify savable
type InternalFilesystemOptions struct {
// If LeakConnection is true, do not close the connection to the server
// when the Filesystem is released. This is necessary for deployments in
@@ -538,6 +545,8 @@ func (fs *filesystem) Release(ctx context.Context) {
}
// dentry implements vfs.DentryImpl.
+//
+// +stateify savable
type dentry struct {
vfsd vfs.Dentry
@@ -567,7 +576,7 @@ type dentry struct {
// If file.isNil(), this dentry represents a synthetic file, i.e. a file
// that does not exist on the remote filesystem. As of this writing, the
// only files that can be synthetic are sockets, pipes, and directories.
- file p9file
+ file p9file `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
// If deleted is non-zero, the file represented by this dentry has been
// deleted. deleted is accessed using atomic memory operations.
@@ -579,7 +588,7 @@ type dentry struct {
cached bool
dentryEntry
- dirMu sync.Mutex
+ dirMu sync.Mutex `state:"nosave"`
// If this dentry represents a directory, children contains:
//
@@ -611,7 +620,7 @@ type dentry struct {
// To mutate:
// - Lock metadataMu and use atomic operations to update because we might
// have atomic readers that don't hold the lock.
- metadataMu sync.Mutex
+ metadataMu sync.Mutex `state:"nosave"`
ino inodeNumber // immutable
mode uint32 // type is immutable, perms are mutable
uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
@@ -642,7 +651,7 @@ type dentry struct {
// other metadata fields.
nlink uint32
- mapsMu sync.Mutex
+ mapsMu sync.Mutex `state:"nosave"`
// If this dentry represents a regular file, mappings tracks mappings of
// the file into memmap.MappingSpaces. mappings is protected by mapsMu.
@@ -666,12 +675,12 @@ type dentry struct {
// either p9.File transitions from closed (isNil() == true) to open
// (isNil() == false), it may be mutated with handleMu locked, but cannot
// be closed until the dentry is destroyed.
- handleMu sync.RWMutex
- readFile p9file
- writeFile p9file
+ handleMu sync.RWMutex `state:"nosave"`
+ readFile p9file `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
+ writeFile p9file `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
hostFD int32
- dataMu sync.RWMutex
+ dataMu sync.RWMutex `state:"nosave"`
// If this dentry represents a regular file that is client-cached, cache
// maps offsets into the cached file to offsets into
@@ -837,7 +846,7 @@ func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) {
atomic.StoreUint32(&d.nlink, uint32(attr.NLink))
}
if mask.Size {
- d.updateFileSizeLocked(attr.Size)
+ d.updateSizeLocked(attr.Size)
}
}
@@ -991,7 +1000,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs
// d.size should be kept up to date, and privatized
// copy-on-write mappings of truncated pages need to be
// invalidated, even if InteropModeShared is in effect.
- d.updateFileSizeLocked(stat.Size)
+ d.updateSizeLocked(stat.Size)
}
}
if d.fs.opts.interop == InteropModeShared {
@@ -1028,8 +1037,31 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs
return nil
}
+// doAllocate performs an allocate operation on d. Note that d.metadataMu will
+// be held when allocate is called.
+func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate func() error) error {
+ d.metadataMu.Lock()
+ defer d.metadataMu.Unlock()
+
+ // Allocating a smaller size is a noop.
+ size := offset + length
+ if d.cachedMetadataAuthoritative() && size <= d.size {
+ return nil
+ }
+
+ err := allocate()
+ if err != nil {
+ return err
+ }
+ d.updateSizeLocked(size)
+ if d.cachedMetadataAuthoritative() {
+ d.touchCMtimeLocked()
+ }
+ return nil
+}
+
// Preconditions: d.metadataMu must be locked.
-func (d *dentry) updateFileSizeLocked(newSize uint64) {
+func (d *dentry) updateSizeLocked(newSize uint64) {
d.dataMu.Lock()
oldSize := d.size
atomic.StoreUint64(&d.size, newSize)
@@ -1067,6 +1099,21 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes)
return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
}
+func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
+ // We only support xattrs prefixed with "user." (see b/148380782). Currently,
+ // there is no need to expose any other xattrs through a gofer.
+ if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+ return syserror.EOPNOTSUPP
+ }
+ mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+ kuid := auth.KUID(atomic.LoadUint32(&d.uid))
+ kgid := auth.KGID(atomic.LoadUint32(&d.gid))
+ if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
+ return err
+ }
+ return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
+}
+
func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error {
return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&child.uid)))
}
@@ -1300,30 +1347,19 @@ func (d *dentry) destroyLocked(ctx context.Context) {
d.handleMu.Unlock()
if !d.file.isNil() {
- if !d.isDeleted() {
- // Write dirty timestamps back to the remote filesystem.
- atimeDirty := atomic.LoadUint32(&d.atimeDirty) != 0
- mtimeDirty := atomic.LoadUint32(&d.mtimeDirty) != 0
- if atimeDirty || mtimeDirty {
- atime := atomic.LoadInt64(&d.atime)
- mtime := atomic.LoadInt64(&d.mtime)
- if err := d.file.setAttr(ctx, p9.SetAttrMask{
- ATime: atimeDirty,
- ATimeNotSystemTime: atimeDirty,
- MTime: mtimeDirty,
- MTimeNotSystemTime: mtimeDirty,
- }, p9.SetAttr{
- ATimeSeconds: uint64(atime / 1e9),
- ATimeNanoSeconds: uint64(atime % 1e9),
- MTimeSeconds: uint64(mtime / 1e9),
- MTimeNanoSeconds: uint64(mtime % 1e9),
- }); err != nil {
- log.Warningf("gofer.dentry.destroyLocked: failed to write dirty timestamps back: %v", err)
- }
- }
+ // Note that it's possible that d.atimeDirty or d.mtimeDirty are true,
+ // i.e. client and server timestamps may differ (because e.g. a client
+ // write was serviced by the page cache, and only written back to the
+ // remote file later). Ideally, we'd write client timestamps back to
+ // the remote filesystem so that timestamps for a new dentry
+ // instantiated for the same file would remain coherent. Unfortunately,
+ // this turns out to be too expensive in many cases, so for now we
+ // don't do this.
+ if err := d.file.close(ctx); err != nil {
+ log.Warningf("gofer.dentry.destroyLocked: failed to close file: %v", err)
}
- d.file.close(ctx)
d.file = p9file{}
+
// Remove d from the set of syncable dentries.
d.fs.syncMu.Lock()
delete(d.fs.syncableDentries, d)
@@ -1351,9 +1387,7 @@ func (d *dentry) setDeleted() {
atomic.StoreUint32(&d.deleted, 1)
}
-// We only support xattrs prefixed with "user." (see b/148380782). Currently,
-// there is no need to expose any other xattrs through a gofer.
-func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
+func (d *dentry) listXattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
if d.file.isNil() || !d.userXattrSupported() {
return nil, nil
}
@@ -1363,6 +1397,7 @@ func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size ui
}
xattrs := make([]string, 0, len(xattrMap))
for x := range xattrMap {
+ // We only support xattrs in the user.* namespace.
if strings.HasPrefix(x, linux.XATTR_USER_PREFIX) {
xattrs = append(xattrs, x)
}
@@ -1370,51 +1405,33 @@ func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size ui
return xattrs, nil
}
-func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
+func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
if d.file.isNil() {
return "", syserror.ENODATA
}
- if err := d.checkPermissions(creds, vfs.MayRead); err != nil {
+ if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
return "", err
}
- if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
- return "", syserror.EOPNOTSUPP
- }
- if !d.userXattrSupported() {
- return "", syserror.ENODATA
- }
return d.file.getXattr(ctx, opts.Name, opts.Size)
}
-func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
+func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
if d.file.isNil() {
return syserror.EPERM
}
- if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
+ if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
return err
}
- if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
- return syserror.EOPNOTSUPP
- }
- if !d.userXattrSupported() {
- return syserror.EPERM
- }
return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags)
}
-func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name string) error {
+func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error {
if d.file.isNil() {
return syserror.EPERM
}
- if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
+ if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
return err
}
- if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
- return syserror.EOPNOTSUPP
- }
- if !d.userXattrSupported() {
- return syserror.EPERM
- }
return d.file.removeXattr(ctx, name)
}
@@ -1623,12 +1640,14 @@ func (d *dentry) decLinks() {
// fileDescription is embedded by gofer implementations of
// vfs.FileDescriptionImpl.
+//
+// +stateify savable
type fileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.LockFD
- lockLogging sync.Once
+ lockLogging sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
}
func (fd *fileDescription) filesystem() *filesystem {
@@ -1666,30 +1685,30 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
return nil
}
-// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
-func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
- return fd.dentry().listxattr(ctx, auth.CredentialsFromContext(ctx), size)
+// ListXattr implements vfs.FileDescriptionImpl.ListXattr.
+func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
+ return fd.dentry().listXattr(ctx, auth.CredentialsFromContext(ctx), size)
}
-// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
-func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
- return fd.dentry().getxattr(ctx, auth.CredentialsFromContext(ctx), &opts)
+// GetXattr implements vfs.FileDescriptionImpl.GetXattr.
+func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
+ return fd.dentry().getXattr(ctx, auth.CredentialsFromContext(ctx), &opts)
}
-// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
-func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
+// SetXattr implements vfs.FileDescriptionImpl.SetXattr.
+func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
d := fd.dentry()
- if err := d.setxattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil {
+ if err := d.setXattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil {
return err
}
d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
return nil
}
-// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
-func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
+// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
+func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
d := fd.dentry()
- if err := d.removexattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil {
+ if err := d.removeXattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil {
return err
}
d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go
index 104157512..a9ebe1206 100644
--- a/pkg/sentry/fsimpl/gofer/handle.go
+++ b/pkg/sentry/fsimpl/gofer/handle.go
@@ -25,6 +25,8 @@ import (
// handle represents a remote "open file descriptor", consisting of an opened
// fid (p9.File) and optionally a host file descriptor.
+//
+// These are explicitly not savable.
type handle struct {
file p9file
fd int32 // -1 if unavailable
diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go
index 87f0b877f..21b4a96fe 100644
--- a/pkg/sentry/fsimpl/gofer/p9file.go
+++ b/pkg/sentry/fsimpl/gofer/p9file.go
@@ -127,6 +127,13 @@ func (f p9file) close(ctx context.Context) error {
return err
}
+func (f p9file) setAttrClose(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAttr) error {
+ ctx.UninterruptibleSleepStart(false)
+ err := f.file.SetAttrClose(valid, attr)
+ ctx.UninterruptibleSleepFinish(false)
+ return err
+}
+
func (f p9file) open(ctx context.Context, flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
ctx.UninterruptibleSleepStart(false)
fdobj, qid, iounit, err := f.file.Open(flags)
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index a2e9342d5..eeaf6e444 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -39,11 +39,12 @@ func (d *dentry) isRegularFile() bool {
return d.fileType() == linux.S_IFREG
}
+// +stateify savable
type regularFileFD struct {
fileDescription
// off is the file offset. off is protected by mu.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
off int64
}
@@ -79,28 +80,11 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error {
// Allocate implements vfs.FileDescriptionImpl.Allocate.
func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
d := fd.dentry()
- d.metadataMu.Lock()
- defer d.metadataMu.Unlock()
-
- // Allocating a smaller size is a noop.
- size := offset + length
- if d.cachedMetadataAuthoritative() && size <= d.size {
- return nil
- }
-
- d.handleMu.RLock()
- err := d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
- d.handleMu.RUnlock()
- if err != nil {
- return err
- }
- d.dataMu.Lock()
- atomic.StoreUint64(&d.size, size)
- d.dataMu.Unlock()
- if d.cachedMetadataAuthoritative() {
- d.touchCMtimeLocked()
- }
- return nil
+ return d.doAllocate(ctx, offset, length, func() error {
+ d.handleMu.RLock()
+ defer d.handleMu.RUnlock()
+ return d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
+ })
}
// PRead implements vfs.FileDescriptionImpl.PRead.
@@ -915,6 +899,8 @@ func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
// dentryPlatformFile is only used when a host FD representing the remote file
// is available (i.e. dentry.hostFD >= 0), and that FD is used for application
// memory mappings (i.e. !filesystem.opts.forcePageCache).
+//
+// +stateify savable
type dentryPlatformFile struct {
*dentry
@@ -927,7 +913,7 @@ type dentryPlatformFile struct {
hostFileMapper fsutil.HostFileMapper
// hostFileMapperInitOnce is used to lazily initialize hostFileMapper.
- hostFileMapperInitOnce sync.Once
+ hostFileMapperInitOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
}
// IncRef implements memmap.File.IncRef.
diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go
index 85d2bee72..326b940a7 100644
--- a/pkg/sentry/fsimpl/gofer/socket.go
+++ b/pkg/sentry/fsimpl/gofer/socket.go
@@ -36,12 +36,14 @@ func (d *dentry) isSocket() bool {
// An endpoint's lifetime is the time between when filesystem.BoundEndpointAt()
// is called and either BoundEndpoint.BidirectionalConnect or
// BoundEndpoint.UnidirectionalConnect is called.
+//
+// +stateify savable
type endpoint struct {
// dentry is the filesystem dentry which produced this endpoint.
dentry *dentry
// file is the p9 file that contains a single unopened fid.
- file p9.File
+ file p9.File `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
// path is the sentry path where this endpoint is bound.
path string
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
index 3c39aa9b7..71581736c 100644
--- a/pkg/sentry/fsimpl/gofer/special_file.go
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -22,6 +22,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fdnotifier"
+ "gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
@@ -33,11 +34,13 @@ import (
// special files, and (when filesystemOptions.regularFilesUseSpecialFileFD is
// in effect) regular files. specialFileFD differs from regularFileFD by using
// per-FD handles instead of shared per-dentry handles, and never buffering I/O.
+//
+// +stateify savable
type specialFileFD struct {
fileDescription
// handle is used for file I/O. handle is immutable.
- handle handle
+ handle handle `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
// isRegularFile is true if this FD represents a regular file which is only
// possible when filesystemOptions.regularFilesUseSpecialFileFD is in
@@ -55,7 +58,7 @@ type specialFileFD struct {
queue waiter.Queue
// If seekable is true, off is the file offset. off is protected by mu.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
off int64
}
@@ -135,6 +138,16 @@ func (fd *specialFileFD) EventUnregister(e *waiter.Entry) {
fd.fileDescription.EventUnregister(e)
}
+func (fd *specialFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ if fd.isRegularFile {
+ d := fd.dentry()
+ return d.doAllocate(ctx, offset, length, func() error {
+ return fd.handle.file.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
+ })
+ }
+ return fd.FileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length)
+}
+
// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
if fd.seekable && offset < 0 {
@@ -235,11 +248,12 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off
d.touchCMtime()
}
buf := make([]byte, src.NumBytes())
- // Don't do partial writes if we get a partial read from src.
- if _, err := src.CopyIn(ctx, buf); err != nil {
- return 0, offset, err
+ copied, copyErr := src.CopyIn(ctx, buf)
+ if copied == 0 && copyErr != nil {
+ // Only return the error if we didn't get any data.
+ return 0, offset, copyErr
}
- n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
+ n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:copied])), uint64(offset))
if err == syserror.EAGAIN {
err = syserror.ErrWouldBlock
}
@@ -256,7 +270,10 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off
atomic.StoreUint64(&d.size, uint64(offset))
}
}
- return int64(n), offset, err
+ if err != nil {
+ return int64(n), offset, err
+ }
+ return int64(n), offset, copyErr
}
// Write implements vfs.FileDescriptionImpl.Write.
diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD
index be1c88c82..56bcf9bdb 100644
--- a/pkg/sentry/fsimpl/host/BUILD
+++ b/pkg/sentry/fsimpl/host/BUILD
@@ -49,6 +49,7 @@ go_library(
"//pkg/fspath",
"//pkg/iovec",
"//pkg/log",
+ "//pkg/marshal/primitive",
"//pkg/refs",
"//pkg/safemem",
"//pkg/sentry/arch",
@@ -72,7 +73,6 @@ go_library(
"//pkg/unet",
"//pkg/usermem",
"//pkg/waiter",
- "//tools/go_marshal/primitive",
"@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 7561f821c..ffe4ddb32 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -58,7 +58,7 @@ func newInode(fs *filesystem, hostFD int, fileType linux.FileMode, isTTY bool) (
canMap: fileType == linux.S_IFREG,
}
i.pf.inode = i
- i.refs.EnableLeakCheck()
+ i.EnableLeakCheck()
// Non-seekable files can't be memory mapped, assert this.
if !i.seekable && i.canMap {
@@ -126,7 +126,7 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions)
// For simplicity, fileDescription.offset is set to 0. Technically, we
// should only set to 0 on files that are not seekable (sockets, pipes,
// etc.), and use the offset from the host fd otherwise when importing.
- return i.open(ctx, d.VFSDentry(), mnt, flags)
+ return i.open(ctx, d, mnt, flags)
}
// ImportFD sets up and returns a vfs.FileDescription from a donated fd.
@@ -137,14 +137,16 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs
}
// filesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type filesystemType struct{}
-// GetFilesystem implements FilesystemType.GetFilesystem.
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
panic("host.filesystemType.GetFilesystem should never be called")
}
-// Name implements FilesystemType.Name.
+// Name implements vfs.FilesystemType.Name.
func (filesystemType) Name() string {
return "none"
}
@@ -166,6 +168,8 @@ func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) {
}
// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
type filesystem struct {
kernfs.Filesystem
@@ -185,6 +189,8 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
}
// inode implements kernfs.Inode.
+//
+// +stateify savable
type inode struct {
kernfs.InodeNoStatFS
kernfs.InodeNotDirectory
@@ -193,7 +199,7 @@ type inode struct {
locks vfs.FileLocks
// When the reference count reaches zero, the host fd is closed.
- refs inodeRefs
+ inodeRefs
// hostFD contains the host fd that this file was originally created from,
// which must be available at time of restore.
@@ -233,7 +239,7 @@ type inode struct {
canMap bool
// mapsMu protects mappings.
- mapsMu sync.Mutex
+ mapsMu sync.Mutex `state:"nosave"`
// If canMap is true, mappings tracks mappings of hostFD into
// memmap.MappingSpaces.
@@ -243,7 +249,7 @@ type inode struct {
pf inodePlatformFile
}
-// CheckPermissions implements kernfs.Inode.
+// CheckPermissions implements kernfs.Inode.CheckPermissions.
func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
var s syscall.Stat_t
if err := syscall.Fstat(i.hostFD, &s); err != nil {
@@ -252,7 +258,7 @@ func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, a
return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid))
}
-// Mode implements kernfs.Inode.
+// Mode implements kernfs.Inode.Mode.
func (i *inode) Mode() linux.FileMode {
var s syscall.Stat_t
if err := syscall.Fstat(i.hostFD, &s); err != nil {
@@ -263,7 +269,7 @@ func (i *inode) Mode() linux.FileMode {
return linux.FileMode(s.Mode)
}
-// Stat implements kernfs.Inode.
+// Stat implements kernfs.Inode.Stat.
func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
if opts.Mask&linux.STATX__RESERVED != 0 {
return linux.Statx{}, syserror.EINVAL
@@ -376,7 +382,7 @@ func (i *inode) fstat(fs *filesystem) (linux.Statx, error) {
}, nil
}
-// SetStat implements kernfs.Inode.
+// SetStat implements kernfs.Inode.SetStat.
func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
s := &opts.Stat
@@ -435,19 +441,9 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
return nil
}
-// IncRef implements kernfs.Inode.
-func (i *inode) IncRef() {
- i.refs.IncRef()
-}
-
-// TryIncRef implements kernfs.Inode.
-func (i *inode) TryIncRef() bool {
- return i.refs.TryIncRef()
-}
-
-// DecRef implements kernfs.Inode.
+// DecRef implements kernfs.Inode.DecRef.
func (i *inode) DecRef(ctx context.Context) {
- i.refs.DecRef(func() {
+ i.inodeRefs.DecRef(func() {
if i.wouldBlock {
fdnotifier.RemoveFD(int32(i.hostFD))
}
@@ -457,16 +453,16 @@ func (i *inode) DecRef(ctx context.Context) {
})
}
-// Open implements kernfs.Inode.
-func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+// Open implements kernfs.Inode.Open.
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
// Once created, we cannot re-open a socket fd through /proc/[pid]/fd/.
if i.Mode().FileType() == linux.S_IFSOCK {
return nil, syserror.ENXIO
}
- return i.open(ctx, vfsd, rp.Mount(), opts.Flags)
+ return i.open(ctx, d, rp.Mount(), opts.Flags)
}
-func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, error) {
+func (i *inode) open(ctx context.Context, d *kernfs.Dentry, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, error) {
var s syscall.Stat_t
if err := syscall.Fstat(i.hostFD, &s); err != nil {
return nil, err
@@ -490,17 +486,17 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u
return nil, err
}
// Currently, we only allow Unix sockets to be imported.
- return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d, &i.locks)
+ return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d.VFSDentry(), &i.locks)
case syscall.S_IFREG, syscall.S_IFIFO, syscall.S_IFCHR:
if i.isTTY {
fd := &TTYFileDescription{
fileDescription: fileDescription{inode: i},
- termios: linux.DefaultSlaveTermios,
+ termios: linux.DefaultReplicaTermios,
}
fd.LockFD.Init(&i.locks)
vfsfd := &fd.vfsfd
- if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
+ if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
return nil, err
}
return vfsfd, nil
@@ -509,7 +505,7 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u
fd := &fileDescription{inode: i}
fd.LockFD.Init(&i.locks)
vfsfd := &fd.vfsfd
- if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
+ if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
return nil, err
}
return vfsfd, nil
@@ -521,6 +517,8 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u
}
// fileDescription is embedded by host fd implementations of FileDescriptionImpl.
+//
+// +stateify savable
type fileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
@@ -535,40 +533,35 @@ type fileDescription struct {
inode *inode
// offsetMu protects offset.
- offsetMu sync.Mutex
+ offsetMu sync.Mutex `state:"nosave"`
// offset specifies the current file offset. It is only meaningful when
// inode.seekable is true.
offset int64
}
-// SetStat implements vfs.FileDescriptionImpl.
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
creds := auth.CredentialsFromContext(ctx)
return f.inode.SetStat(ctx, f.vfsfd.Mount().Filesystem(), creds, opts)
}
-// Stat implements vfs.FileDescriptionImpl.
+// Stat implements vfs.FileDescriptionImpl.Stat.
func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
return f.inode.Stat(ctx, f.vfsfd.Mount().Filesystem(), opts)
}
-// Release implements vfs.FileDescriptionImpl.
+// Release implements vfs.FileDescriptionImpl.Release.
func (f *fileDescription) Release(context.Context) {
// noop
}
-// Allocate implements vfs.FileDescriptionImpl.
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error {
- if !f.inode.seekable {
- return syserror.ESPIPE
- }
-
- // TODO(gvisor.dev/issue/3589): Implement Allocate for non-pipe hostfds.
- return syserror.EOPNOTSUPP
+ return unix.Fallocate(f.inode.hostFD, uint32(mode), int64(offset), int64(length))
}
-// PRead implements FileDescriptionImpl.
+// PRead implements vfs.FileDescriptionImpl.PRead.
func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
i := f.inode
if !i.seekable {
@@ -578,7 +571,7 @@ func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, off
return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags)
}
-// Read implements FileDescriptionImpl.
+// Read implements vfs.FileDescriptionImpl.Read.
func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
i := f.inode
if !i.seekable {
@@ -615,7 +608,7 @@ func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, off
return int64(n), err
}
-// PWrite implements FileDescriptionImpl.
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
if !f.inode.seekable {
return 0, syserror.ESPIPE
@@ -624,7 +617,7 @@ func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, of
return f.writeToHostFD(ctx, src, offset, opts.Flags)
}
-// Write implements FileDescriptionImpl.
+// Write implements vfs.FileDescriptionImpl.Write.
func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
i := f.inode
if !i.seekable {
@@ -672,7 +665,7 @@ func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSeque
return int64(n), err
}
-// Seek implements FileDescriptionImpl.
+// Seek implements vfs.FileDescriptionImpl.Seek.
//
// Note that we do not support seeking on directories, since we do not even
// allow directory fds to be imported at all.
@@ -737,13 +730,13 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i
return f.offset, nil
}
-// Sync implements FileDescriptionImpl.
+// Sync implements vfs.FileDescriptionImpl.Sync.
func (f *fileDescription) Sync(context.Context) error {
// TODO(gvisor.dev/issue/1897): Currently, we always sync everything.
return unix.Fsync(f.inode.hostFD)
}
-// ConfigureMMap implements FileDescriptionImpl.
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error {
if !f.inode.canMap {
return syserror.ENODEV
diff --git a/pkg/sentry/fsimpl/host/mmap.go b/pkg/sentry/fsimpl/host/mmap.go
index 65d3af38c..b51a17bed 100644
--- a/pkg/sentry/fsimpl/host/mmap.go
+++ b/pkg/sentry/fsimpl/host/mmap.go
@@ -27,11 +27,13 @@ import (
// cannot implement both kernfs.Inode.IncRef and memmap.File.IncRef.
//
// inodePlatformFile should only be used if inode.canMap is true.
+//
+// +stateify savable
type inodePlatformFile struct {
*inode
// fdRefsMu protects fdRefs.
- fdRefsMu sync.Mutex
+ fdRefsMu sync.Mutex `state:"nosave"`
// fdRefs counts references on memmap.File offsets. It is used solely for
// memory accounting.
@@ -41,7 +43,7 @@ type inodePlatformFile struct {
fileMapper fsutil.HostFileMapper
// fileMapperInitOnce is used to lazily initialize fileMapper.
- fileMapperInitOnce sync.Once
+ fileMapperInitOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
}
// IncRef implements memmap.File.IncRef.
diff --git a/pkg/sentry/fsimpl/host/socket_unsafe.go b/pkg/sentry/fsimpl/host/socket_unsafe.go
index 35ded24bc..c0bf45f08 100644
--- a/pkg/sentry/fsimpl/host/socket_unsafe.go
+++ b/pkg/sentry/fsimpl/host/socket_unsafe.go
@@ -63,10 +63,10 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) (
controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC
if n > length {
- return length, n, msg.Controllen, controlTrunc, err
+ return length, n, msg.Controllen, controlTrunc, nil
}
- return n, n, msg.Controllen, controlTrunc, err
+ return n, n, msg.Controllen, controlTrunc, nil
}
// fdWriteVec sends from bufs to fd.
diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go
index 7a9be4b97..f5c596fec 100644
--- a/pkg/sentry/fsimpl/host/tty.go
+++ b/pkg/sentry/fsimpl/host/tty.go
@@ -17,6 +17,7 @@ package host
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -25,11 +26,12 @@ import (
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
- "gvisor.dev/gvisor/tools/go_marshal/primitive"
)
// TTYFileDescription implements vfs.FileDescriptionImpl for a host file
// descriptor that wraps a TTY FD.
+//
+// +stateify savable
type TTYFileDescription struct {
fileDescription
@@ -76,7 +78,7 @@ func (t *TTYFileDescription) Release(ctx context.Context) {
t.fileDescription.Release(ctx)
}
-// PRead implements vfs.FileDescriptionImpl.
+// PRead implements vfs.FileDescriptionImpl.PRead.
//
// Reading from a TTY is only allowed for foreground process groups. Background
// process groups will either get EIO or a SIGTTIN.
@@ -94,7 +96,7 @@ func (t *TTYFileDescription) PRead(ctx context.Context, dst usermem.IOSequence,
return t.fileDescription.PRead(ctx, dst, offset, opts)
}
-// Read implements vfs.FileDescriptionImpl.
+// Read implements vfs.FileDescriptionImpl.Read.
//
// Reading from a TTY is only allowed for foreground process groups. Background
// process groups will either get EIO or a SIGTTIN.
@@ -112,7 +114,7 @@ func (t *TTYFileDescription) Read(ctx context.Context, dst usermem.IOSequence, o
return t.fileDescription.Read(ctx, dst, opts)
}
-// PWrite implements vfs.FileDescriptionImpl.
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
t.mu.Lock()
defer t.mu.Unlock()
@@ -127,7 +129,7 @@ func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence,
return t.fileDescription.PWrite(ctx, src, offset, opts)
}
-// Write implements vfs.FileDescriptionImpl.
+// Write implements vfs.FileDescriptionImpl.Write.
func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
t.mu.Lock()
defer t.mu.Unlock()
@@ -142,7 +144,7 @@ func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence,
return t.fileDescription.Write(ctx, src, opts)
}
-// Ioctl implements vfs.FileDescriptionImpl.
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
task := kernel.TaskFromContext(ctx)
if task == nil {
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index 637dca70c..5e91e0536 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -83,6 +83,7 @@ go_library(
"slot_list.go",
"static_directory_refs.go",
"symlink.go",
+ "synthetic_directory.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index 1ee089620..b929118b1 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -56,9 +56,9 @@ func (f *DynamicBytesFile) Init(creds *auth.Credentials, devMajor, devMinor uint
}
// Open implements Inode.Open.
-func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd := &DynamicBytesFD{}
- if err := fd.Init(rp.Mount(), vfsd, f.data, &f.locks, opts.Flags); err != nil {
+ if err := fd.Init(rp.Mount(), d, f.data, &f.locks, opts.Flags); err != nil {
return nil, err
}
return &fd.vfsfd, nil
@@ -87,12 +87,12 @@ type DynamicBytesFD struct {
}
// Init initializes a DynamicBytesFD.
-func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error {
+func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error {
fd.LockFD.Init(locks)
- if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
+ if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
return err
}
- fd.inode = d.Impl().(*Dentry).inode
+ fd.inode = d.inode
fd.SetDataSource(data)
return nil
}
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index 6518ff5cd..0a4cd4057 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -29,6 +29,8 @@ import (
)
// SeekEndConfig describes the SEEK_END behaviour for FDs.
+//
+// +stateify savable
type SeekEndConfig int
// Constants related to SEEK_END behaviour for FDs.
@@ -41,6 +43,8 @@ const (
)
// GenericDirectoryFDOptions contains configuration for a GenericDirectoryFD.
+//
+// +stateify savable
type GenericDirectoryFDOptions struct {
SeekEnd SeekEndConfig
}
@@ -56,6 +60,8 @@ type GenericDirectoryFDOptions struct {
// Must be initialize with Init before first use.
//
// Lock ordering: mu => children.mu.
+//
+// +stateify savable
type GenericDirectoryFD struct {
vfs.FileDescriptionDefaultImpl
vfs.DirectoryFileDescriptionDefaultImpl
@@ -68,7 +74,7 @@ type GenericDirectoryFD struct {
children *OrderedChildren
// mu protects the fields below.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
// off is the current directory offset. Protected by "mu".
off int64
@@ -76,12 +82,12 @@ type GenericDirectoryFD struct {
// NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its
// dentry.
-func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) (*GenericDirectoryFD, error) {
+func NewGenericDirectoryFD(m *vfs.Mount, d *Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) (*GenericDirectoryFD, error) {
fd := &GenericDirectoryFD{}
if err := fd.Init(children, locks, opts, fdOpts); err != nil {
return nil, err
}
- if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
+ if err := fd.vfsfd.Init(fd, opts.Flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
return nil, err
}
return fd, nil
@@ -195,8 +201,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
// these.
childIdx := fd.off - 2
for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() {
- inode := it.Dentry.Impl().(*Dentry).inode
- stat, err := inode.Stat(ctx, fd.filesystem(), opts)
+ stat, err := it.Dentry.inode.Stat(ctx, fd.filesystem(), opts)
if err != nil {
return err
}
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 0e3011689..5cc1c4281 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -37,8 +37,7 @@ import (
// * !rp.Done().
//
// Postcondition: Caller must call fs.processDeferredDecRefs*.
-func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, mayFollowSymlinks bool) (*vfs.Dentry, error) {
- d := vfsd.Impl().(*Dentry)
+func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, mayFollowSymlinks bool) (*Dentry, error) {
if !d.isDir() {
return nil, syserror.ENOTDIR
}
@@ -55,20 +54,20 @@ afterSymlink:
// calls d_revalidate(), but walk_component() => handle_dots() does not.
if name == "." {
rp.Advance()
- return vfsd, nil
+ return d, nil
}
if name == ".." {
- if isRoot, err := rp.CheckRoot(ctx, vfsd); err != nil {
+ if isRoot, err := rp.CheckRoot(ctx, d.VFSDentry()); err != nil {
return nil, err
} else if isRoot || d.parent == nil {
rp.Advance()
- return vfsd, nil
+ return d, nil
}
- if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
+ if err := rp.CheckMount(ctx, d.parent.VFSDentry()); err != nil {
return nil, err
}
rp.Advance()
- return &d.parent.vfsd, nil
+ return d.parent, nil
}
if len(name) > linux.NAME_MAX {
return nil, syserror.ENAMETOOLONG
@@ -79,7 +78,7 @@ afterSymlink:
if err != nil {
return nil, err
}
- if err := rp.CheckMount(ctx, &next.vfsd); err != nil {
+ if err := rp.CheckMount(ctx, next.VFSDentry()); err != nil {
return nil, err
}
// Resolve any symlink at current path component.
@@ -102,7 +101,7 @@ afterSymlink:
goto afterSymlink
}
rp.Advance()
- return &next.vfsd, nil
+ return next, nil
}
// revalidateChildLocked must be called after a call to parent.vfsd.Child(name)
@@ -122,25 +121,21 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
if !child.inode.Valid(ctx) {
delete(parent.children, name)
vfsObj.InvalidateDentry(ctx, &child.vfsd)
- fs.deferDecRef(&child.vfsd) // Reference from Lookup.
+ fs.deferDecRef(child) // Reference from Lookup.
child = nil
}
}
if child == nil {
- // Dentry isn't cached; it either doesn't exist or failed
- // revalidation. Attempt to resolve it via Lookup.
- //
- // FIXME(gvisor.dev/issue/1193): Inode.Lookup() should return
- // *(kernfs.)Dentry, not *vfs.Dentry, since (kernfs.)Filesystem assumes
- // that all dentries in the filesystem are (kernfs.)Dentry and performs
- // vfs.DentryImpl casts accordingly.
- childVFSD, err := parent.inode.Lookup(ctx, name)
+ // Dentry isn't cached; it either doesn't exist or failed revalidation.
+ // Attempt to resolve it via Lookup.
+ c, err := parent.inode.Lookup(ctx, name)
if err != nil {
return nil, err
}
- // Reference on childVFSD dropped by a corresponding Valid.
- child = childVFSD.Impl().(*Dentry)
- parent.insertChildLocked(name, child)
+ // Reference on c (provided by Lookup) will be dropped when the dentry
+ // fails validation.
+ parent.InsertChildLocked(name, c)
+ child = c
}
return child, nil
}
@@ -153,20 +148,19 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
// Preconditions: Filesystem.mu must be locked for at least reading.
//
// Postconditions: Caller must call fs.processDeferredDecRefs*.
-func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) {
- vfsd := rp.Start()
+func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*Dentry, error) {
+ d := rp.Start().Impl().(*Dentry)
for !rp.Done() {
var err error
- vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */)
+ d, err = fs.stepExistingLocked(ctx, rp, d, true /* mayFollowSymlinks */)
if err != nil {
- return nil, nil, err
+ return nil, err
}
}
- d := vfsd.Impl().(*Dentry)
if rp.MustBeDir() && !d.isDir() {
- return nil, nil, syserror.ENOTDIR
+ return nil, syserror.ENOTDIR
}
- return vfsd, d.inode, nil
+ return d, nil
}
// walkParentDirLocked resolves all but the last path component of rp to an
@@ -181,20 +175,19 @@ func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingP
// * !rp.Done().
//
// Postconditions: Caller must call fs.processDeferredDecRefs*.
-func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) {
- vfsd := rp.Start()
+func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*Dentry, error) {
+ d := rp.Start().Impl().(*Dentry)
for !rp.Final() {
var err error
- vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */)
+ d, err = fs.stepExistingLocked(ctx, rp, d, true /* mayFollowSymlinks */)
if err != nil {
- return nil, nil, err
+ return nil, err
}
}
- d := vfsd.Impl().(*Dentry)
if !d.isDir() {
- return nil, nil, syserror.ENOTDIR
+ return nil, syserror.ENOTDIR
}
- return vfsd, d.inode, nil
+ return d, nil
}
// checkCreateLocked checks that a file named rp.Component() may be created in
@@ -202,10 +195,9 @@ func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving
//
// Preconditions:
// * Filesystem.mu must be locked for at least reading.
-// * parentInode == parentVFSD.Impl().(*Dentry).Inode.
// * isDir(parentInode) == true.
-func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) {
- if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *Dentry) (string, error) {
+ if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
return "", err
}
pc := rp.Component()
@@ -215,11 +207,10 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v
if len(pc) > linux.NAME_MAX {
return "", syserror.ENAMETOOLONG
}
- // FIXME(gvisor.dev/issue/1193): Data race due to not holding dirMu.
- if _, ok := parentVFSD.Impl().(*Dentry).children[pc]; ok {
+ if _, ok := parent.children[pc]; ok {
return "", syserror.EEXIST
}
- if parentVFSD.IsDead() {
+ if parent.VFSDentry().IsDead() {
return "", syserror.ENOENT
}
return pc, nil
@@ -228,8 +219,8 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v
// checkDeleteLocked checks that the file represented by vfsd may be deleted.
//
// Preconditions: Filesystem.mu must be locked for at least reading.
-func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error {
- parent := vfsd.Impl().(*Dentry).parent
+func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) error {
+ parent := d.parent
if parent == nil {
return syserror.EBUSY
}
@@ -258,11 +249,11 @@ func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds
defer fs.processDeferredDecRefs(ctx)
defer fs.mu.RUnlock()
- _, inode, err := fs.walkExistingLocked(ctx, rp)
+ d, err := fs.walkExistingLocked(ctx, rp)
if err != nil {
return err
}
- return inode.CheckPermissions(ctx, creds, ats)
+ return d.inode.CheckPermissions(ctx, creds, ats)
}
// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
@@ -270,20 +261,20 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
fs.mu.RLock()
defer fs.processDeferredDecRefs(ctx)
defer fs.mu.RUnlock()
- vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
+ d, err := fs.walkExistingLocked(ctx, rp)
if err != nil {
return nil, err
}
if opts.CheckSearchable {
- d := vfsd.Impl().(*Dentry)
if !d.isDir() {
return nil, syserror.ENOTDIR
}
- if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
+ if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
return nil, err
}
}
+ vfsd := d.VFSDentry()
vfsd.IncRef() // Ownership transferred to caller.
return vfsd, nil
}
@@ -293,12 +284,12 @@ func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa
fs.mu.RLock()
defer fs.processDeferredDecRefs(ctx)
defer fs.mu.RUnlock()
- vfsd, _, err := fs.walkParentDirLocked(ctx, rp)
+ d, err := fs.walkParentDirLocked(ctx, rp)
if err != nil {
return nil, err
}
- vfsd.IncRef() // Ownership transferred to caller.
- return vfsd, nil
+ d.IncRef() // Ownership transferred to caller.
+ return d.VFSDentry(), nil
}
// LinkAt implements vfs.FilesystemImpl.LinkAt.
@@ -308,12 +299,15 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
}
fs.mu.Lock()
defer fs.mu.Unlock()
- parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+ parent, err := fs.walkParentDirLocked(ctx, rp)
fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return err
}
- pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
+
+ parent.dirMu.Lock()
+ defer parent.dirMu.Unlock()
+ pc, err := checkCreateLocked(ctx, rp, parent)
if err != nil {
return err
}
@@ -330,11 +324,11 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
return syserror.EPERM
}
- childVFSD, err := parentInode.NewLink(ctx, pc, d.inode)
+ child, err := parent.inode.NewLink(ctx, pc, d.inode)
if err != nil {
return err
}
- parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
+ parent.InsertChildLocked(pc, child)
return nil
}
@@ -345,12 +339,15 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
}
fs.mu.Lock()
defer fs.mu.Unlock()
- parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+ parent, err := fs.walkParentDirLocked(ctx, rp)
fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return err
}
- pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
+
+ parent.dirMu.Lock()
+ defer parent.dirMu.Unlock()
+ pc, err := checkCreateLocked(ctx, rp, parent)
if err != nil {
return err
}
@@ -358,11 +355,14 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
return err
}
defer rp.Mount().EndWrite()
- childVFSD, err := parentInode.NewDir(ctx, pc, opts)
+ child, err := parent.inode.NewDir(ctx, pc, opts)
if err != nil {
- return err
+ if !opts.ForSyntheticMountpoint || err == syserror.EEXIST {
+ return err
+ }
+ child = newSyntheticDirectory(rp.Credentials(), opts.Mode)
}
- parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
+ parent.InsertChildLocked(pc, child)
return nil
}
@@ -373,12 +373,15 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
}
fs.mu.Lock()
defer fs.mu.Unlock()
- parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+ parent, err := fs.walkParentDirLocked(ctx, rp)
fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return err
}
- pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
+
+ parent.dirMu.Lock()
+ defer parent.dirMu.Unlock()
+ pc, err := checkCreateLocked(ctx, rp, parent)
if err != nil {
return err
}
@@ -386,11 +389,11 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
return err
}
defer rp.Mount().EndWrite()
- newVFSD, err := parentInode.NewNode(ctx, pc, opts)
+ newD, err := parent.inode.NewNode(ctx, pc, opts)
if err != nil {
return err
}
- parentVFSD.Impl().(*Dentry).InsertChild(pc, newVFSD.Impl().(*Dentry))
+ parent.InsertChildLocked(pc, newD)
return nil
}
@@ -406,28 +409,27 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
// Do not create new file.
if opts.Flags&linux.O_CREAT == 0 {
fs.mu.RLock()
- vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
+ d, err := fs.walkExistingLocked(ctx, rp)
if err != nil {
fs.mu.RUnlock()
fs.processDeferredDecRefs(ctx)
return nil, err
}
- if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
+ if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
fs.mu.RUnlock()
fs.processDeferredDecRefs(ctx)
return nil, err
}
- inode.IncRef()
- defer inode.DecRef(ctx)
+ d.inode.IncRef()
+ defer d.inode.DecRef(ctx)
fs.mu.RUnlock()
fs.processDeferredDecRefs(ctx)
- return inode.Open(ctx, rp, vfsd, opts)
+ return d.inode.Open(ctx, rp, d, opts)
}
// May create new file.
mustCreate := opts.Flags&linux.O_EXCL != 0
- vfsd := rp.Start()
- inode := vfsd.Impl().(*Dentry).inode
+ d := rp.Start().Impl().(*Dentry)
fs.mu.Lock()
unlocked := false
unlock := func() {
@@ -444,22 +446,22 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
if mustCreate {
return nil, syserror.EEXIST
}
- if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
+ if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
return nil, err
}
- inode.IncRef()
- defer inode.DecRef(ctx)
+ d.inode.IncRef()
+ defer d.inode.DecRef(ctx)
unlock()
- return inode.Open(ctx, rp, vfsd, opts)
+ return d.inode.Open(ctx, rp, d, opts)
}
afterTrailingSymlink:
- parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+ parent, err := fs.walkParentDirLocked(ctx, rp)
fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return nil, err
}
// Check for search permission in the parent directory.
- if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
+ if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
return nil, err
}
// Reject attempts to open directories with O_CREAT.
@@ -474,10 +476,10 @@ afterTrailingSymlink:
return nil, syserror.ENAMETOOLONG
}
// Determine whether or not we need to create a file.
- childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD, false /* mayFollowSymlinks */)
+ child, err := fs.stepExistingLocked(ctx, rp, parent, false /* mayFollowSymlinks */)
if err == syserror.ENOENT {
// Already checked for searchability above; now check for writability.
- if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
+ if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
return nil, err
}
if err := rp.Mount().CheckBeginWrite(); err != nil {
@@ -485,16 +487,18 @@ afterTrailingSymlink:
}
defer rp.Mount().EndWrite()
// Create and open the child.
- childVFSD, err = parentInode.NewFile(ctx, pc, opts)
+ child, err := parent.inode.NewFile(ctx, pc, opts)
if err != nil {
return nil, err
}
- child := childVFSD.Impl().(*Dentry)
- parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
+ // FIXME(gvisor.dev/issue/1193): Race between checking existence with
+ // fs.stepExistingLocked and parent.InsertChild. If possible, we should hold
+ // dirMu from one to the other.
+ parent.InsertChild(pc, child)
child.inode.IncRef()
defer child.inode.DecRef(ctx)
unlock()
- return child.inode.Open(ctx, rp, childVFSD, opts)
+ return child.inode.Open(ctx, rp, child, opts)
}
if err != nil {
return nil, err
@@ -503,7 +507,6 @@ afterTrailingSymlink:
if mustCreate {
return nil, syserror.EEXIST
}
- child := childVFSD.Impl().(*Dentry)
if rp.ShouldFollowSymlink() && child.isSymlink() {
targetVD, targetPathname, err := child.inode.Getlink(ctx, rp.Mount())
if err != nil {
@@ -530,22 +533,22 @@ afterTrailingSymlink:
child.inode.IncRef()
defer child.inode.DecRef(ctx)
unlock()
- return child.inode.Open(ctx, rp, &child.vfsd, opts)
+ return child.inode.Open(ctx, rp, child, opts)
}
// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
fs.mu.RLock()
- d, inode, err := fs.walkExistingLocked(ctx, rp)
+ d, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
fs.processDeferredDecRefs(ctx)
if err != nil {
return "", err
}
- if !d.Impl().(*Dentry).isSymlink() {
+ if !d.isSymlink() {
return "", syserror.EINVAL
}
- return inode.Readlink(ctx)
+ return d.inode.Readlink(ctx, rp.Mount())
}
// RenameAt implements vfs.FilesystemImpl.RenameAt.
@@ -562,11 +565,10 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
// Resolve the destination directory first to verify that it's on this
// Mount.
- dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp)
+ dstDir, err := fs.walkParentDirLocked(ctx, rp)
if err != nil {
return err
}
- dstDir := dstDirVFSD.Impl().(*Dentry)
mnt := rp.Mount()
if mnt != oldParentVD.Mount() {
return syserror.EXDEV
@@ -584,16 +586,15 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
if err != nil {
return err
}
- srcVFSD := &src.vfsd
// Can we remove the src dentry?
- if err := checkDeleteLocked(ctx, rp, srcVFSD); err != nil {
+ if err := checkDeleteLocked(ctx, rp, src); err != nil {
return err
}
// Can we create the dst dentry?
var dst *Dentry
- pc, err := checkCreateLocked(ctx, rp, dstDirVFSD, dstDirInode)
+ pc, err := checkCreateLocked(ctx, rp, dstDir)
switch err {
case nil:
// Ok, continue with rename as replacement.
@@ -604,14 +605,14 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
}
dst = dstDir.children[pc]
if dst == nil {
- panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDirVFSD))
+ panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDir))
}
default:
return err
}
var dstVFSD *vfs.Dentry
if dst != nil {
- dstVFSD = &dst.vfsd
+ dstVFSD = dst.VFSDentry()
}
mntns := vfs.MountNamespaceFromContext(ctx)
@@ -627,17 +628,18 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
defer dstDir.dirMu.Unlock()
}
+ srcVFSD := src.VFSDentry()
if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil {
return err
}
- replaced, err := srcDir.inode.Rename(ctx, src.name, pc, srcVFSD, dstDirVFSD)
+ replaced, err := srcDir.inode.Rename(ctx, src.name, pc, src, dstDir)
if err != nil {
virtfs.AbortRenameDentry(srcVFSD, dstVFSD)
return err
}
delete(srcDir.children, src.name)
if srcDir != dstDir {
- fs.deferDecRef(srcDirVFSD)
+ fs.deferDecRef(srcDir)
dstDir.IncRef()
}
src.parent = dstDir
@@ -646,7 +648,11 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
dstDir.children = make(map[string]*Dentry)
}
dstDir.children[pc] = src
- virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaced)
+ var replaceVFSD *vfs.Dentry
+ if replaced != nil {
+ replaceVFSD = replaced.VFSDentry()
+ }
+ virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaceVFSD)
return nil
}
@@ -654,7 +660,8 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
fs.mu.Lock()
defer fs.mu.Unlock()
- vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
+
+ d, err := fs.walkExistingLocked(ctx, rp)
fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return err
@@ -663,14 +670,13 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
return err
}
defer rp.Mount().EndWrite()
- if err := checkDeleteLocked(ctx, rp, vfsd); err != nil {
+ if err := checkDeleteLocked(ctx, rp, d); err != nil {
return err
}
- d := vfsd.Impl().(*Dentry)
if !d.isDir() {
return syserror.ENOTDIR
}
- if inode.HasChildren() {
+ if d.inode.HasChildren() {
return syserror.ENOTEMPTY
}
virtfs := rp.VirtualFilesystem()
@@ -680,10 +686,12 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
mntns := vfs.MountNamespaceFromContext(ctx)
defer mntns.DecRef(ctx)
+ vfsd := d.VFSDentry()
if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
return err
}
- if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil {
+
+ if err := parentDentry.inode.RmDir(ctx, d.name, d); err != nil {
virtfs.AbortDeleteDentry(vfsd)
return err
}
@@ -694,7 +702,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
fs.mu.RLock()
- _, inode, err := fs.walkExistingLocked(ctx, rp)
+ d, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
fs.processDeferredDecRefs(ctx)
if err != nil {
@@ -703,31 +711,31 @@ func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
if opts.Stat.Mask == 0 {
return nil
}
- return inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts)
+ return d.inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts)
}
// StatAt implements vfs.FilesystemImpl.StatAt.
func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
fs.mu.RLock()
- _, inode, err := fs.walkExistingLocked(ctx, rp)
+ d, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
fs.processDeferredDecRefs(ctx)
if err != nil {
return linux.Statx{}, err
}
- return inode.Stat(ctx, fs.VFSFilesystem(), opts)
+ return d.inode.Stat(ctx, fs.VFSFilesystem(), opts)
}
// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
fs.mu.RLock()
- _, inode, err := fs.walkExistingLocked(ctx, rp)
+ d, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
fs.processDeferredDecRefs(ctx)
if err != nil {
return linux.Statfs{}, err
}
- return inode.StatFS(ctx, fs.VFSFilesystem())
+ return d.inode.StatFS(ctx, fs.VFSFilesystem())
}
// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
@@ -737,12 +745,15 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
}
fs.mu.Lock()
defer fs.mu.Unlock()
- parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+ parent, err := fs.walkParentDirLocked(ctx, rp)
fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return err
}
- pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
+ parent.dirMu.Lock()
+ defer parent.dirMu.Unlock()
+
+ pc, err := checkCreateLocked(ctx, rp, parent)
if err != nil {
return err
}
@@ -750,11 +761,11 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
return err
}
defer rp.Mount().EndWrite()
- childVFSD, err := parentInode.NewSymlink(ctx, pc, target)
+ child, err := parent.inode.NewSymlink(ctx, pc, target)
if err != nil {
return err
}
- parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
+ parent.InsertChildLocked(pc, child)
return nil
}
@@ -762,7 +773,8 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
fs.mu.Lock()
defer fs.mu.Unlock()
- vfsd, _, err := fs.walkExistingLocked(ctx, rp)
+
+ d, err := fs.walkExistingLocked(ctx, rp)
fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return err
@@ -771,10 +783,9 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
return err
}
defer rp.Mount().EndWrite()
- if err := checkDeleteLocked(ctx, rp, vfsd); err != nil {
+ if err := checkDeleteLocked(ctx, rp, d); err != nil {
return err
}
- d := vfsd.Impl().(*Dentry)
if d.isDir() {
return syserror.EISDIR
}
@@ -784,10 +795,11 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
defer parentDentry.dirMu.Unlock()
mntns := vfs.MountNamespaceFromContext(ctx)
defer mntns.DecRef(ctx)
+ vfsd := d.VFSDentry()
if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
return err
}
- if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil {
+ if err := parentDentry.inode.Unlink(ctx, d.name, d); err != nil {
virtfs.AbortDeleteDentry(vfsd)
return err
}
@@ -795,25 +807,25 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
return nil
}
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
fs.mu.RLock()
- _, inode, err := fs.walkExistingLocked(ctx, rp)
+ d, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
fs.processDeferredDecRefs(ctx)
if err != nil {
return nil, err
}
- if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
+ if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
return nil, err
}
return nil, syserror.ECONNREFUSED
}
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *Filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
fs.mu.RLock()
- _, _, err := fs.walkExistingLocked(ctx, rp)
+ _, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
fs.processDeferredDecRefs(ctx)
if err != nil {
@@ -823,10 +835,10 @@ func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
return nil, syserror.ENOTSUP
}
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *Filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
fs.mu.RLock()
- _, _, err := fs.walkExistingLocked(ctx, rp)
+ _, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
fs.processDeferredDecRefs(ctx)
if err != nil {
@@ -836,10 +848,10 @@ func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
return "", syserror.ENOTSUP
}
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *Filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
fs.mu.RLock()
- _, _, err := fs.walkExistingLocked(ctx, rp)
+ _, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
fs.processDeferredDecRefs(ctx)
if err != nil {
@@ -849,10 +861,10 @@ func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
return syserror.ENOTSUP
}
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *Filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
fs.mu.RLock()
- _, _, err := fs.walkExistingLocked(ctx, rp)
+ _, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
fs.processDeferredDecRefs(ctx)
if err != nil {
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index c0b863ba4..49210e748 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -31,6 +31,8 @@ import (
// count for inodes, performing no extra actions when references are obtained or
// released. This is suitable for simple file inodes that don't reference any
// resources.
+//
+// +stateify savable
type InodeNoopRefCount struct {
}
@@ -50,30 +52,32 @@ func (InodeNoopRefCount) TryIncRef() bool {
// InodeDirectoryNoNewChildren partially implements the Inode interface.
// InodeDirectoryNoNewChildren represents a directory inode which does not
// support creation of new children.
+//
+// +stateify savable
type InodeDirectoryNoNewChildren struct{}
// NewFile implements Inode.NewFile.
-func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*Dentry, error) {
return nil, syserror.EPERM
}
// NewDir implements Inode.NewDir.
-func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*Dentry, error) {
return nil, syserror.EPERM
}
// NewLink implements Inode.NewLink.
-func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*Dentry, error) {
return nil, syserror.EPERM
}
// NewSymlink implements Inode.NewSymlink.
-func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*Dentry, error) {
return nil, syserror.EPERM
}
// NewNode implements Inode.NewNode.
-func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*Dentry, error) {
return nil, syserror.EPERM
}
@@ -81,6 +85,8 @@ func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOpt
// inodeDirectory and inodeDynamicDirectory sub interfaces. Inodes that do not
// represent directories can embed this to provide no-op implementations for
// directory-related functions.
+//
+// +stateify savable
type InodeNotDirectory struct {
}
@@ -90,47 +96,47 @@ func (InodeNotDirectory) HasChildren() bool {
}
// NewFile implements Inode.NewFile.
-func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*Dentry, error) {
panic("NewFile called on non-directory inode")
}
// NewDir implements Inode.NewDir.
-func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*Dentry, error) {
panic("NewDir called on non-directory inode")
}
// NewLink implements Inode.NewLinkink.
-func (InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewLink(context.Context, string, Inode) (*Dentry, error) {
panic("NewLink called on non-directory inode")
}
// NewSymlink implements Inode.NewSymlink.
-func (InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewSymlink(context.Context, string, string) (*Dentry, error) {
panic("NewSymlink called on non-directory inode")
}
// NewNode implements Inode.NewNode.
-func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*Dentry, error) {
panic("NewNode called on non-directory inode")
}
// Unlink implements Inode.Unlink.
-func (InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error {
+func (InodeNotDirectory) Unlink(context.Context, string, *Dentry) error {
panic("Unlink called on non-directory inode")
}
// RmDir implements Inode.RmDir.
-func (InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error {
+func (InodeNotDirectory) RmDir(context.Context, string, *Dentry) error {
panic("RmDir called on non-directory inode")
}
// Rename implements Inode.Rename.
-func (InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) {
+func (InodeNotDirectory) Rename(context.Context, string, string, *Dentry, *Dentry) (*Dentry, error) {
panic("Rename called on non-directory inode")
}
// Lookup implements Inode.Lookup.
-func (InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+func (InodeNotDirectory) Lookup(ctx context.Context, name string) (*Dentry, error) {
panic("Lookup called on non-directory inode")
}
@@ -149,10 +155,12 @@ func (InodeNotDirectory) Valid(context.Context) bool {
// dymanic entries (i.e. entries that are not "hashed" into the
// vfs.Dentry.children) can embed this to provide no-op implementations for
// functions related to dynamic entries.
+//
+// +stateify savable
type InodeNoDynamicLookup struct{}
// Lookup implements Inode.Lookup.
-func (InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+func (InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*Dentry, error) {
return nil, syserror.ENOENT
}
@@ -169,10 +177,12 @@ func (InodeNoDynamicLookup) Valid(ctx context.Context) bool {
// InodeNotSymlink partially implements the Inode interface, specifically the
// inodeSymlink sub interface. All inodes that are not symlinks may embed this
// to return the appropriate errors from symlink-related functions.
+//
+// +stateify savable
type InodeNotSymlink struct{}
// Readlink implements Inode.Readlink.
-func (InodeNotSymlink) Readlink(context.Context) (string, error) {
+func (InodeNotSymlink) Readlink(context.Context, *vfs.Mount) (string, error) {
return "", syserror.EINVAL
}
@@ -186,6 +196,8 @@ func (InodeNotSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry,
// inode attributes.
//
// Must be initialized by Init prior to first use.
+//
+// +stateify savable
type InodeAttrs struct {
devMajor uint32
devMinor uint32
@@ -256,12 +268,29 @@ func (a *InodeAttrs) Stat(context.Context, *vfs.Filesystem, vfs.StatOptions) (li
// SetStat implements Inode.SetStat.
func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+ return a.SetInodeStat(ctx, fs, creds, opts)
+}
+
+// SetInodeStat sets the corresponding attributes from opts to InodeAttrs.
+// This function can be used by other kernfs-based filesystem implementation to
+// sets the unexported attributes into kernfs.InodeAttrs.
+func (a *InodeAttrs) SetInodeStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
if opts.Stat.Mask == 0 {
return nil
}
- if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 {
+
+ // Note that not all fields are modifiable. For example, the file type and
+ // inode numbers are immutable after node creation. Setting the size is often
+ // allowed by kernfs files but does not do anything. If some other behavior is
+ // needed, the embedder should consider extending SetStat.
+ //
+ // TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps.
+ if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_SIZE) != 0 {
return syserror.EPERM
}
+ if opts.Stat.Mask&linux.STATX_SIZE != 0 && a.Mode().IsDir() {
+ return syserror.EISDIR
+ }
if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
return err
}
@@ -284,13 +313,6 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut
atomic.StoreUint32(&a.gid, stat.GID)
}
- // Note that not all fields are modifiable. For example, the file type and
- // inode numbers are immutable after node creation.
-
- // TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps.
- // Also, STATX_SIZE will need some special handling, because read-only static
- // files should return EIO for truncate operations.
-
return nil
}
@@ -320,13 +342,16 @@ func (a *InodeAttrs) DecLinks() {
}
}
+// +stateify savable
type slot struct {
Name string
- Dentry *vfs.Dentry
+ Dentry *Dentry
slotEntry
}
// OrderedChildrenOptions contains initialization options for OrderedChildren.
+//
+// +stateify savable
type OrderedChildrenOptions struct {
// Writable indicates whether vfs.FilesystemImpl methods implemented by
// OrderedChildren may modify the tracked children. This applies to
@@ -342,12 +367,14 @@ type OrderedChildrenOptions struct {
// directories.
//
// Must be initialize with Init before first use.
+//
+// +stateify savable
type OrderedChildren struct {
// Can children be modified by user syscalls? It set to false, interface
// methods that would modify the children return EPERM. Immutable.
writable bool
- mu sync.RWMutex
+ mu sync.RWMutex `state:"nosave"`
order slotList
set map[string]*slot
}
@@ -380,7 +407,7 @@ func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint3
if child.isDir() {
links++
}
- if err := o.Insert(name, child.VFSDentry()); err != nil {
+ if err := o.Insert(name, child); err != nil {
panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d))
}
d.InsertChild(name, child)
@@ -397,7 +424,7 @@ func (o *OrderedChildren) HasChildren() bool {
// Insert inserts child into o. This ignores the writability of o, as this is
// not part of the vfs.FilesystemImpl interface, and is a lower-level operation.
-func (o *OrderedChildren) Insert(name string, child *vfs.Dentry) error {
+func (o *OrderedChildren) Insert(name string, child *Dentry) error {
o.mu.Lock()
defer o.mu.Unlock()
if _, ok := o.set[name]; ok {
@@ -421,10 +448,10 @@ func (o *OrderedChildren) removeLocked(name string) {
}
// Precondition: caller must hold o.mu for writing.
-func (o *OrderedChildren) replaceChildLocked(name string, new *vfs.Dentry) *vfs.Dentry {
+func (o *OrderedChildren) replaceChildLocked(name string, new *Dentry) *Dentry {
if s, ok := o.set[name]; ok {
// Existing slot with given name, simply replace the dentry.
- var old *vfs.Dentry
+ var old *Dentry
old, s.Dentry = s.Dentry, new
return old
}
@@ -440,7 +467,7 @@ func (o *OrderedChildren) replaceChildLocked(name string, new *vfs.Dentry) *vfs.
}
// Precondition: caller must hold o.mu for reading or writing.
-func (o *OrderedChildren) checkExistingLocked(name string, child *vfs.Dentry) error {
+func (o *OrderedChildren) checkExistingLocked(name string, child *Dentry) error {
s, ok := o.set[name]
if !ok {
return syserror.ENOENT
@@ -452,7 +479,7 @@ func (o *OrderedChildren) checkExistingLocked(name string, child *vfs.Dentry) er
}
// Unlink implements Inode.Unlink.
-func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.Dentry) error {
+func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *Dentry) error {
if !o.writable {
return syserror.EPERM
}
@@ -468,12 +495,13 @@ func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.De
}
// Rmdir implements Inode.Rmdir.
-func (o *OrderedChildren) RmDir(ctx context.Context, name string, child *vfs.Dentry) error {
+func (o *OrderedChildren) RmDir(ctx context.Context, name string, child *Dentry) error {
// We're not responsible for checking that child is a directory, that it's
// empty, or updating any link counts; so this is the same as unlink.
return o.Unlink(ctx, name, child)
}
+// +stateify savable
type renameAcrossDifferentImplementationsError struct{}
func (renameAcrossDifferentImplementationsError) Error() string {
@@ -489,8 +517,8 @@ func (renameAcrossDifferentImplementationsError) Error() string {
// that will support Rename.
//
// Postcondition: reference on any replaced dentry transferred to caller.
-func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (*vfs.Dentry, error) {
- dst, ok := dstDir.Impl().(*Dentry).inode.(interface{}).(*OrderedChildren)
+func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir *Dentry) (*Dentry, error) {
+ dst, ok := dstDir.inode.(interface{}).(*OrderedChildren)
if !ok {
return nil, renameAcrossDifferentImplementationsError{}
}
@@ -532,12 +560,14 @@ func (o *OrderedChildren) nthLocked(i int64) *slot {
}
// InodeSymlink partially implements Inode interface for symlinks.
+//
+// +stateify savable
type InodeSymlink struct {
InodeNotDirectory
}
// Open implements Inode.Open.
-func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
return nil, syserror.ELOOP
}
@@ -564,6 +594,7 @@ var _ Inode = (*StaticDirectory)(nil)
func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*Dentry, fdOpts GenericDirectoryFDOptions) *Dentry {
inode := &StaticDirectory{}
inode.Init(creds, devMajor, devMinor, ino, perm, fdOpts)
+ inode.EnableLeakCheck()
dentry := &Dentry{}
dentry.Init(inode)
@@ -584,9 +615,9 @@ func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint3
s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeDirectory|perm)
}
-// Open implements kernfs.Inode.
-func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &s.locks, &opts, s.fdOpts)
+// Open implements kernfs.Inode.Open.
+func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd, err := NewGenericDirectoryFD(rp.Mount(), d, &s.OrderedChildren, &s.locks, &opts, s.fdOpts)
if err != nil {
return nil, err
}
@@ -598,21 +629,25 @@ func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credenti
return syserror.EPERM
}
-// DecRef implements kernfs.Inode.
+// DecRef implements kernfs.Inode.DecRef.
func (s *StaticDirectory) DecRef(context.Context) {
s.StaticDirectoryRefs.DecRef(s.Destroy)
}
// AlwaysValid partially implements kernfs.inodeDynamicLookup.
+//
+// +stateify savable
type AlwaysValid struct{}
-// Valid implements kernfs.inodeDynamicLookup.
+// Valid implements kernfs.inodeDynamicLookup.Valid.
func (*AlwaysValid) Valid(context.Context) bool {
return true
}
// InodeNoStatFS partially implements the Inode interface, where the client
// filesystem doesn't support statfs(2).
+//
+// +stateify savable
type InodeNoStatFS struct{}
// StatFS implements Inode.StatFS.
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index 88fcd54aa..6d3d79333 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -29,7 +29,7 @@
//
// Reference Model:
//
-// Kernfs dentries represents named pointers to inodes. Dentries and inode have
+// Kernfs dentries represents named pointers to inodes. Dentries and inodes have
// independent lifetimes and reference counts. A child dentry unconditionally
// holds a reference on its parent directory's dentry. A dentry also holds a
// reference on the inode it points to. Multiple dentries can point to the same
@@ -60,20 +60,23 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
)
// Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory
// filesystem. Concrete implementations are expected to embed this in their own
// Filesystem type.
+//
+// +stateify savable
type Filesystem struct {
vfsfs vfs.Filesystem
- droppedDentriesMu sync.Mutex
+ droppedDentriesMu sync.Mutex `state:"nosave"`
// droppedDentries is a list of dentries waiting to be DecRef()ed. This is
// used to defer dentry destruction until mu can be acquired for
// writing. Protected by droppedDentriesMu.
- droppedDentries []*vfs.Dentry
+ droppedDentries []*Dentry
// mu synchronizes the lifetime of Dentries on this filesystem. Holding it
// for reading guarantees continued existence of any resolved dentries, but
@@ -96,7 +99,7 @@ type Filesystem struct {
// defer fs.mu.RUnlock()
// ...
// fs.deferDecRef(dentry)
- mu sync.RWMutex
+ mu sync.RWMutex `state:"nosave"`
// nextInoMinusOne is used to to allocate inode numbers on this
// filesystem. Must be accessed by atomic operations.
@@ -107,7 +110,7 @@ type Filesystem struct {
// processDeferredDecRefs{,Locked}. See comment on Filesystem.mu.
//
// Precondition: d must not already be pending destruction.
-func (fs *Filesystem) deferDecRef(d *vfs.Dentry) {
+func (fs *Filesystem) deferDecRef(d *Dentry) {
fs.droppedDentriesMu.Lock()
fs.droppedDentries = append(fs.droppedDentries, d)
fs.droppedDentriesMu.Unlock()
@@ -159,6 +162,8 @@ const (
// to, and child dentries hold a reference on their parent.
//
// Must be initialized by Init prior to first use.
+//
+// +stateify savable
type Dentry struct {
DentryRefs
@@ -172,7 +177,11 @@ type Dentry struct {
name string
// dirMu protects children and the names of child Dentries.
- dirMu sync.Mutex
+ //
+ // Note that holding fs.mu for writing is not sufficient;
+ // revalidateChildLocked(), which is a very hot path, may modify children with
+ // fs.mu acquired for reading only.
+ dirMu sync.Mutex `state:"nosave"`
children map[string]*Dentry
inode Inode
@@ -239,24 +248,25 @@ func (d *Dentry) Watches() *vfs.Watches {
func (d *Dentry) OnZeroWatches(context.Context) {}
// InsertChild inserts child into the vfs dentry cache with the given name under
-// this dentry. This does not update the directory inode, so calling this on
-// its own isn't sufficient to insert a child into a directory. InsertChild
-// updates the link count on d if required.
+// this dentry. This does not update the directory inode, so calling this on its
+// own isn't sufficient to insert a child into a directory.
//
// Precondition: d must represent a directory inode.
func (d *Dentry) InsertChild(name string, child *Dentry) {
d.dirMu.Lock()
- d.insertChildLocked(name, child)
+ d.InsertChildLocked(name, child)
d.dirMu.Unlock()
}
-// insertChildLocked is equivalent to InsertChild, with additional
+// InsertChildLocked is equivalent to InsertChild, with additional
// preconditions.
//
-// Precondition: d.dirMu must be locked.
-func (d *Dentry) insertChildLocked(name string, child *Dentry) {
+// Preconditions:
+// * d must represent a directory inode.
+// * d.dirMu must be locked.
+func (d *Dentry) InsertChildLocked(name string, child *Dentry) {
if !d.isDir() {
- panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d))
+ panic(fmt.Sprintf("InsertChildLocked called on non-directory Dentry: %+v.", d))
}
d.IncRef() // DecRef in child's Dentry.destroy.
child.parent = d
@@ -267,6 +277,36 @@ func (d *Dentry) insertChildLocked(name string, child *Dentry) {
d.children[name] = child
}
+// RemoveChild removes child from the vfs dentry cache. This does not update the
+// directory inode or modify the inode to be unlinked. So calling this on its own
+// isn't sufficient to remove a child from a directory.
+//
+// Precondition: d must represent a directory inode.
+func (d *Dentry) RemoveChild(name string, child *Dentry) error {
+ d.dirMu.Lock()
+ defer d.dirMu.Unlock()
+ return d.RemoveChildLocked(name, child)
+}
+
+// RemoveChildLocked is equivalent to RemoveChild, with additional
+// preconditions.
+//
+// Precondition: d.dirMu must be locked.
+func (d *Dentry) RemoveChildLocked(name string, child *Dentry) error {
+ if !d.isDir() {
+ panic(fmt.Sprintf("RemoveChild called on non-directory Dentry: %+v.", d))
+ }
+ c, ok := d.children[name]
+ if !ok {
+ return syserror.ENOENT
+ }
+ if c != child {
+ panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! Child: %+v, vfs: %+v", c, child))
+ }
+ delete(d.children, name)
+ return nil
+}
+
// Inode returns the dentry's inode.
func (d *Dentry) Inode() Inode {
return d.inode
@@ -287,7 +327,6 @@ func (d *Dentry) Inode() Inode {
//
// - Checking that dentries passed to methods are of the appropriate file type.
// - Checking permissions.
-// - Updating link and reference counts.
//
// Specific responsibilities of implementations are documented below.
type Inode interface {
@@ -297,7 +336,8 @@ type Inode interface {
inodeRefs
// Methods related to node metadata. A generic implementation is provided by
- // InodeAttrs.
+ // InodeAttrs. Note that a concrete filesystem using kernfs is responsible for
+ // managing link counts.
inodeMetadata
// Method for inodes that represent symlink. InodeNotSymlink provides a
@@ -315,11 +355,11 @@ type Inode interface {
// Open creates a file description for the filesystem object represented by
// this inode. The returned file description should hold a reference on the
- // inode for its lifetime.
+ // dentry for its lifetime.
//
// Precondition: rp.Done(). vfsd.Impl() must be the kernfs Dentry containing
// the inode on which Open() is being called.
- Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)
+ Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)
// StatFS returns filesystem statistics for the client filesystem. This
// corresponds to vfs.FilesystemImpl.StatFSAt. If the client filesystem
@@ -369,30 +409,30 @@ type inodeDirectory interface {
HasChildren() bool
// NewFile creates a new regular file inode.
- NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error)
+ NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*Dentry, error)
// NewDir creates a new directory inode.
- NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error)
+ NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*Dentry, error)
// NewLink creates a new hardlink to a specified inode in this
// directory. Implementations should create a new kernfs Dentry pointing to
// target, and update target's link count.
- NewLink(ctx context.Context, name string, target Inode) (*vfs.Dentry, error)
+ NewLink(ctx context.Context, name string, target Inode) (*Dentry, error)
// NewSymlink creates a new symbolic link inode.
- NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error)
+ NewSymlink(ctx context.Context, name, target string) (*Dentry, error)
// NewNode creates a new filesystem node for a mknod syscall.
- NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error)
+ NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*Dentry, error)
// Unlink removes a child dentry from this directory inode.
- Unlink(ctx context.Context, name string, child *vfs.Dentry) error
+ Unlink(ctx context.Context, name string, child *Dentry) error
// RmDir removes an empty child directory from this directory
// inode. Implementations must update the parent directory's link count,
// if required. Implementations are not responsible for checking that child
// is a directory, checking for an empty directory.
- RmDir(ctx context.Context, name string, child *vfs.Dentry) error
+ RmDir(ctx context.Context, name string, child *Dentry) error
// Rename is called on the source directory containing an inode being
// renamed. child should point to the resolved child in the source
@@ -400,7 +440,7 @@ type inodeDirectory interface {
// should return the replaced dentry or nil otherwise.
//
// Precondition: Caller must serialize concurrent calls to Rename.
- Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (replaced *vfs.Dentry, err error)
+ Rename(ctx context.Context, oldname, newname string, child, dstDir *Dentry) (replaced *Dentry, err error)
}
type inodeDynamicLookup interface {
@@ -418,14 +458,14 @@ type inodeDynamicLookup interface {
//
// Lookup returns the child with an extra reference and the caller owns this
// reference.
- Lookup(ctx context.Context, name string) (*vfs.Dentry, error)
+ Lookup(ctx context.Context, name string) (*Dentry, error)
// Valid should return true if this inode is still valid, or needs to
// be resolved again by a call to Lookup.
Valid(ctx context.Context) bool
// IterDirents is used to iterate over dynamically created entries. It invokes
- // cb on each entry in the directory represented by the FileDescription.
+ // cb on each entry in the directory represented by the Inode.
// 'offset' is the offset for the entire IterDirents call, which may include
// results from the caller (e.g. "." and ".."). 'relOffset' is the offset
// inside the entries returned by this IterDirents invocation. In other words,
@@ -437,7 +477,7 @@ type inodeDynamicLookup interface {
type inodeSymlink interface {
// Readlink returns the target of a symbolic link. If an inode is not a
// symlink, the implementation should return EINVAL.
- Readlink(ctx context.Context) (string, error)
+ Readlink(ctx context.Context, mnt *vfs.Mount) (string, error)
// Getlink returns the target of a symbolic link, as used by path
// resolution:
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index 675587c6b..e413242dc 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -52,7 +52,7 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System {
v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
- mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.GetFilesystemOptions{})
+ mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.MountOptions{})
if err != nil {
t.Fatalf("Failed to create testfs root mount: %v", err)
}
@@ -121,8 +121,8 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod
return &dir.dentry
}
-func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
+func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
SeekEnd: kernfs.SeekEndStaticEntries,
})
if err != nil {
@@ -162,8 +162,8 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte
return &dir.dentry
}
-func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
+func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
SeekEnd: kernfs.SeekEndStaticEntries,
})
if err != nil {
@@ -176,38 +176,36 @@ func (d *dir) DecRef(context.Context) {
d.dirRefs.DecRef(d.Destroy)
}
-func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) {
+func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*kernfs.Dentry, error) {
creds := auth.CredentialsFromContext(ctx)
dir := d.fs.newDir(creds, opts.Mode, nil)
- dirVFSD := dir.VFSDentry()
- if err := d.OrderedChildren.Insert(name, dirVFSD); err != nil {
+ if err := d.OrderedChildren.Insert(name, dir); err != nil {
dir.DecRef(ctx)
return nil, err
}
d.IncLinks(1)
- return dirVFSD, nil
+ return dir, nil
}
-func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) {
+func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*kernfs.Dentry, error) {
creds := auth.CredentialsFromContext(ctx)
f := d.fs.newFile(creds, "")
- fVFSD := f.VFSDentry()
- if err := d.OrderedChildren.Insert(name, fVFSD); err != nil {
+ if err := d.OrderedChildren.Insert(name, f); err != nil {
f.DecRef(ctx)
return nil, err
}
- return fVFSD, nil
+ return f, nil
}
-func (*dir) NewLink(context.Context, string, kernfs.Inode) (*vfs.Dentry, error) {
+func (*dir) NewLink(context.Context, string, kernfs.Inode) (*kernfs.Dentry, error) {
return nil, syserror.EPERM
}
-func (*dir) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+func (*dir) NewSymlink(context.Context, string, string) (*kernfs.Dentry, error) {
return nil, syserror.EPERM
}
-func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*kernfs.Dentry, error) {
return nil, syserror.EPERM
}
diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go
index 64731a3e4..58a93eaac 100644
--- a/pkg/sentry/fsimpl/kernfs/symlink.go
+++ b/pkg/sentry/fsimpl/kernfs/symlink.go
@@ -24,6 +24,8 @@ import (
// StaticSymlink provides an Inode implementation for symlinks that point to
// a immutable target.
+//
+// +stateify savable
type StaticSymlink struct {
InodeAttrs
InodeNoopRefCount
@@ -51,8 +53,8 @@ func (s *StaticSymlink) Init(creds *auth.Credentials, devMajor uint32, devMinor
s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeSymlink|0777)
}
-// Readlink implements Inode.
-func (s *StaticSymlink) Readlink(_ context.Context) (string, error) {
+// Readlink implements Inode.Readlink.
+func (s *StaticSymlink) Readlink(_ context.Context, _ *vfs.Mount) (string, error) {
return s.target, nil
}
diff --git a/pkg/sentry/fsimpl/kernfs/synthetic_directory.go b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go
new file mode 100644
index 000000000..ea7f073eb
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go
@@ -0,0 +1,102 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// syntheticDirectory implements kernfs.Inode for a directory created by
+// MkdirAt(ForSyntheticMountpoint=true).
+//
+// +stateify savable
+type syntheticDirectory struct {
+ InodeAttrs
+ InodeNoStatFS
+ InodeNoopRefCount
+ InodeNoDynamicLookup
+ InodeNotSymlink
+ OrderedChildren
+
+ locks vfs.FileLocks
+}
+
+var _ Inode = (*syntheticDirectory)(nil)
+
+func newSyntheticDirectory(creds *auth.Credentials, perm linux.FileMode) *Dentry {
+ inode := &syntheticDirectory{}
+ inode.Init(creds, 0 /* devMajor */, 0 /* devMinor */, 0 /* ino */, perm)
+ d := &Dentry{}
+ d.Init(inode)
+ return d
+}
+
+func (dir *syntheticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
+ if perm&^linux.PermissionsMask != 0 {
+ panic(fmt.Sprintf("perm contains non-permission bits: %#o", perm))
+ }
+ dir.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.S_IFDIR|perm)
+ dir.OrderedChildren.Init(OrderedChildrenOptions{
+ Writable: true,
+ })
+}
+
+// Open implements Inode.Open.
+func (dir *syntheticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd, err := NewGenericDirectoryFD(rp.Mount(), d, &dir.OrderedChildren, &dir.locks, &opts, GenericDirectoryFDOptions{})
+ if err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
+
+// NewFile implements Inode.NewFile.
+func (dir *syntheticDirectory) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*Dentry, error) {
+ return nil, syserror.EPERM
+}
+
+// NewDir implements Inode.NewDir.
+func (dir *syntheticDirectory) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*Dentry, error) {
+ if !opts.ForSyntheticMountpoint {
+ return nil, syserror.EPERM
+ }
+ subdird := newSyntheticDirectory(auth.CredentialsFromContext(ctx), opts.Mode&linux.PermissionsMask)
+ if err := dir.OrderedChildren.Insert(name, subdird); err != nil {
+ subdird.DecRef(ctx)
+ return nil, err
+ }
+ return subdird, nil
+}
+
+// NewLink implements Inode.NewLink.
+func (dir *syntheticDirectory) NewLink(ctx context.Context, name string, target Inode) (*Dentry, error) {
+ return nil, syserror.EPERM
+}
+
+// NewSymlink implements Inode.NewSymlink.
+func (dir *syntheticDirectory) NewSymlink(ctx context.Context, name, target string) (*Dentry, error) {
+ return nil, syserror.EPERM
+}
+
+// NewNode implements Inode.NewNode.
+func (dir *syntheticDirectory) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*Dentry, error) {
+ return nil, syserror.EPERM
+}
diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go
index 13735eb05..73b126669 100644
--- a/pkg/sentry/fsimpl/overlay/copy_up.go
+++ b/pkg/sentry/fsimpl/overlay/copy_up.go
@@ -23,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -81,6 +82,8 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
Start: d.parent.upperVD,
Path: fspath.Parse(d.name),
}
+ // Used during copy-up of memory-mapped regular files.
+ var mmapOpts *memmap.MMapOpts
cleanupUndoCopyUp := func() {
var err error
if ftype == linux.S_IFDIR {
@@ -89,7 +92,11 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
err = vfsObj.UnlinkAt(ctx, d.fs.creds, &newpop)
}
if err != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err))
+ }
+ if d.upperVD.Ok() {
+ d.upperVD.DecRef(ctx)
+ d.upperVD = vfs.VirtualDentry{}
}
}
switch ftype {
@@ -132,6 +139,25 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
break
}
}
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ if d.wrappedMappable != nil {
+ // We may have memory mappings of the file on the lower layer.
+ // Switch to mapping the file on the upper layer instead.
+ mmapOpts = &memmap.MMapOpts{
+ Perms: usermem.ReadWrite,
+ MaxPerms: usermem.ReadWrite,
+ }
+ if err := newFD.ConfigureMMap(ctx, mmapOpts); err != nil {
+ cleanupUndoCopyUp()
+ return err
+ }
+ if mmapOpts.MappingIdentity != nil {
+ mmapOpts.MappingIdentity.DecRef(ctx)
+ }
+ // Don't actually switch Mappables until the end of copy-up; see
+ // below for why.
+ }
if err := newFD.SetStat(ctx, vfs.SetStatOptions{
Stat: linux.Statx{
Mask: linux.STATX_UID | linux.STATX_GID,
@@ -234,7 +260,10 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
panic(fmt.Sprintf("unexpected file type %o", ftype))
}
- // TODO(gvisor.dev/issue/1199): copy up xattrs
+ if err := d.copyXattrsLocked(ctx); err != nil {
+ cleanupUndoCopyUp()
+ return err
+ }
// Update the dentry's device and inode numbers (except for directories,
// for which these remain overlay-assigned).
@@ -246,14 +275,10 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
Mask: linux.STATX_INO,
})
if err != nil {
- d.upperVD.DecRef(ctx)
- d.upperVD = vfs.VirtualDentry{}
cleanupUndoCopyUp()
return err
}
if upperStat.Mask&linux.STATX_INO == 0 {
- d.upperVD.DecRef(ctx)
- d.upperVD = vfs.VirtualDentry{}
cleanupUndoCopyUp()
return syserror.EREMOTE
}
@@ -262,6 +287,135 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
atomic.StoreUint64(&d.ino, upperStat.Ino)
}
+ if mmapOpts != nil && mmapOpts.Mappable != nil {
+ // Note that if mmapOpts != nil, then d.mapsMu is locked for writing
+ // (from the S_IFREG path above).
+
+ // Propagate mappings of d to the new Mappable. Remember which mappings
+ // we added so we can remove them on failure.
+ upperMappable := mmapOpts.Mappable
+ allAdded := make(map[memmap.MappableRange]memmap.MappingsOfRange)
+ for seg := d.lowerMappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+ added := make(memmap.MappingsOfRange)
+ for m := range seg.Value() {
+ if err := upperMappable.AddMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable); err != nil {
+ for m := range added {
+ upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable)
+ }
+ for mr, mappings := range allAdded {
+ for m := range mappings {
+ upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, mr.Start, m.Writable)
+ }
+ }
+ return err
+ }
+ added[m] = struct{}{}
+ }
+ allAdded[seg.Range()] = added
+ }
+
+ // Switch to the new Mappable. We do this at the end of copy-up
+ // because:
+ //
+ // - We need to switch Mappables (by changing d.wrappedMappable) before
+ // invalidating Translations from the old Mappable (to pick up
+ // Translations from the new one).
+ //
+ // - We need to lock d.dataMu while changing d.wrappedMappable, but
+ // must invalidate Translations with d.dataMu unlocked (due to lock
+ // ordering).
+ //
+ // - Consequently, once we unlock d.dataMu, other threads may
+ // immediately observe the new (copied-up) Mappable, which we want to
+ // delay until copy-up is guaranteed to succeed.
+ d.dataMu.Lock()
+ lowerMappable := d.wrappedMappable
+ d.wrappedMappable = upperMappable
+ d.dataMu.Unlock()
+ d.lowerMappings.InvalidateAll(memmap.InvalidateOpts{})
+
+ // Remove mappings from the old Mappable.
+ for seg := d.lowerMappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+ for m := range seg.Value() {
+ lowerMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable)
+ }
+ }
+ d.lowerMappings.RemoveAll()
+ }
+
atomic.StoreUint32(&d.copiedUp, 1)
return nil
}
+
+// copyXattrsLocked copies a subset of lower's extended attributes to upper.
+// Attributes that configure an overlay in the lower are not copied up.
+//
+// Preconditions: d.copyMu must be locked for writing.
+func (d *dentry) copyXattrsLocked(ctx context.Context) error {
+ vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ lowerPop := &vfs.PathOperation{Root: d.lowerVDs[0], Start: d.lowerVDs[0]}
+ upperPop := &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}
+
+ lowerXattrs, err := vfsObj.ListXattrAt(ctx, d.fs.creds, lowerPop, 0)
+ if err != nil {
+ if err == syserror.EOPNOTSUPP {
+ // There are no guarantees as to the contents of lowerXattrs.
+ return nil
+ }
+ ctx.Infof("failed to copy up xattrs because ListXattrAt failed: %v", err)
+ return err
+ }
+
+ for _, name := range lowerXattrs {
+ // Do not copy up overlay attributes.
+ if isOverlayXattr(name) {
+ continue
+ }
+
+ value, err := vfsObj.GetXattrAt(ctx, d.fs.creds, lowerPop, &vfs.GetXattrOptions{Name: name, Size: 0})
+ if err != nil {
+ ctx.Infof("failed to copy up xattrs because GetXattrAt failed: %v", err)
+ return err
+ }
+
+ if err := vfsObj.SetXattrAt(ctx, d.fs.creds, upperPop, &vfs.SetXattrOptions{Name: name, Value: value}); err != nil {
+ ctx.Infof("failed to copy up xattrs because SetXattrAt failed: %v", err)
+ return err
+ }
+ }
+ return nil
+}
+
+// copyUpDescendantsLocked ensures that all descendants of d are copied up.
+//
+// Preconditions:
+// * filesystem.renameMu must be locked.
+// * d.dirMu must be locked.
+// * d.isDir().
+func (d *dentry) copyUpDescendantsLocked(ctx context.Context, ds **[]*dentry) error {
+ dirents, err := d.getDirentsLocked(ctx)
+ if err != nil {
+ return err
+ }
+ for _, dirent := range dirents {
+ if dirent.Name == "." || dirent.Name == ".." {
+ continue
+ }
+ child, err := d.fs.getChildLocked(ctx, d, dirent.Name, ds)
+ if err != nil {
+ return err
+ }
+ if err := child.copyUpLocked(ctx); err != nil {
+ return err
+ }
+ if child.isDir() {
+ child.dirMu.Lock()
+ err := child.copyUpDescendantsLocked(ctx, ds)
+ child.dirMu.Unlock()
+ if err != nil {
+ return err
+ }
+ }
+ }
+ return nil
+}
diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go
index b1b292e83..df4492346 100644
--- a/pkg/sentry/fsimpl/overlay/directory.go
+++ b/pkg/sentry/fsimpl/overlay/directory.go
@@ -100,12 +100,13 @@ func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string
return whiteouts, readdirErr
}
+// +stateify savable
type directoryFD struct {
fileDescription
vfs.DirectoryFileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
off int64
dirents []vfs.Dirent
}
@@ -116,10 +117,12 @@ func (fd *directoryFD) Release(ctx context.Context) {
// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+ d := fd.dentry()
+ defer d.InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent)
+
fd.mu.Lock()
defer fd.mu.Unlock()
- d := fd.dentry()
if fd.dirents == nil {
ds, err := d.getDirents(ctx)
if err != nil {
@@ -143,7 +146,14 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
defer d.fs.renameMu.RUnlock()
d.dirMu.Lock()
defer d.dirMu.Unlock()
+ return d.getDirentsLocked(ctx)
+}
+// Preconditions:
+// * filesystem.renameMu must be locked.
+// * d.dirMu must be locked.
+// * d.isDir().
+func (d *dentry) getDirentsLocked(ctx context.Context) ([]vfs.Dirent, error) {
if d.dirents != nil {
return d.dirents, nil
}
diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go
index e720bfb0b..bd11372d5 100644
--- a/pkg/sentry/fsimpl/overlay/filesystem.go
+++ b/pkg/sentry/fsimpl/overlay/filesystem.go
@@ -15,6 +15,8 @@
package overlay
import (
+ "fmt"
+ "strings"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -27,10 +29,15 @@ import (
"gvisor.dev/gvisor/pkg/syserror"
)
+// _OVL_XATTR_PREFIX is an extended attribute key prefix to identify overlayfs
+// attributes.
+// Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_PREFIX
+const _OVL_XATTR_PREFIX = linux.XATTR_TRUSTED_PREFIX + "overlay."
+
// _OVL_XATTR_OPAQUE is an extended attribute key whose value is set to "y" for
// opaque directories.
// Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_OPAQUE
-const _OVL_XATTR_OPAQUE = linux.XATTR_TRUSTED_PREFIX + "overlay.opaque"
+const _OVL_XATTR_OPAQUE = _OVL_XATTR_PREFIX + "opaque"
func isWhiteout(stat *linux.Statx) bool {
return stat.Mode&linux.S_IFMT == linux.S_IFCHR && stat.RdevMajor == 0 && stat.RdevMinor == 0
@@ -205,6 +212,7 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
lookupErr = err
return false
}
+ defer childVD.DecRef(ctx)
mask := uint32(linux.STATX_TYPE)
if !existsOnAnyLayer {
@@ -243,6 +251,7 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
}
// Update child to include this layer.
+ childVD.IncRef()
if isUpper {
child.upperVD = childVD
child.copiedUp = 1
@@ -267,10 +276,10 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
// Directories are merged with directories from lower layers if they
// are not explicitly opaque.
- opaqueVal, err := vfsObj.GetxattrAt(ctx, fs.creds, &vfs.PathOperation{
+ opaqueVal, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{
Root: childVD,
Start: childVD,
- }, &vfs.GetxattrOptions{
+ }, &vfs.GetXattrOptions{
Name: _OVL_XATTR_OPAQUE,
Size: 1,
})
@@ -490,7 +499,13 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
if err := create(parent, name, childLayer == lookupLayerUpperWhiteout); err != nil {
return err
}
+
parent.dirents = nil
+ ev := linux.IN_CREATE
+ if dir {
+ ev |= linux.IN_ISDIR
+ }
+ parent.watches.Notify(ctx, name, uint32(ev), 0 /* cookie */, vfs.InodeEvent, false /* unlinked */)
return nil
}
@@ -504,7 +519,7 @@ func (fs *filesystem) createWhiteout(ctx context.Context, vfsObj *vfs.VirtualFil
func (fs *filesystem) cleanupRecreateWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) {
if err := fs.createWhiteout(ctx, vfsObj, pop); err != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err))
}
}
@@ -616,12 +631,13 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
},
}); err != nil {
if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); cleanupErr != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr))
} else if haveUpperWhiteout {
fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop)
}
return err
}
+ old.watches.Notify(ctx, "", linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent, false /* unlinked */)
return nil
})
}
@@ -655,7 +671,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
},
}); err != nil {
if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr))
} else if haveUpperWhiteout {
fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
}
@@ -665,12 +681,12 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
// There may be directories on lower layers (previously hidden by
// the whiteout) that the new directory should not be merged with.
// Mark it opaque to prevent merging.
- if err := vfsObj.SetxattrAt(ctx, fs.creds, &pop, &vfs.SetxattrOptions{
+ if err := vfsObj.SetXattrAt(ctx, fs.creds, &pop, &vfs.SetXattrOptions{
Name: _OVL_XATTR_OPAQUE,
Value: "y",
}); err != nil {
if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr))
} else {
fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
}
@@ -714,7 +730,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
},
}); err != nil {
if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr))
} else if haveUpperWhiteout {
fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
}
@@ -743,6 +759,9 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
start := rp.Start().Impl().(*dentry)
if rp.Done() {
+ if mayCreate && rp.MustBeDir() {
+ return nil, syserror.EISDIR
+ }
if mustCreate {
return nil, syserror.EEXIST
}
@@ -766,6 +785,10 @@ afterTrailingSymlink:
if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
return nil, err
}
+ // Reject attempts to open directories with O_CREAT.
+ if mayCreate && rp.MustBeDir() {
+ return nil, syserror.EISDIR
+ }
// Determine whether or not we need to create a file.
parent.dirMu.Lock()
child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
@@ -774,12 +797,11 @@ afterTrailingSymlink:
parent.dirMu.Unlock()
return fd, err
}
+ parent.dirMu.Unlock()
if err != nil {
- parent.dirMu.Unlock()
return nil, err
}
// Open existing child or follow symlink.
- parent.dirMu.Unlock()
if mustCreate {
return nil, syserror.EEXIST
}
@@ -794,6 +816,9 @@ afterTrailingSymlink:
start = parent
goto afterTrailingSymlink
}
+ if rp.MustBeDir() && !child.isDir() {
+ return nil, syserror.ENOTDIR
+ }
if mayWrite {
if err := child.copyUpLocked(ctx); err != nil {
return nil, err
@@ -925,7 +950,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving
},
}); err != nil {
if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr))
} else if haveUpperWhiteout {
fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
}
@@ -936,7 +961,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving
child, err := fs.getChildLocked(ctx, parent, childName, ds)
if err != nil {
if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr))
} else if haveUpperWhiteout {
fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
}
@@ -957,6 +982,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving
// just can't open it anymore for some reason.
return nil, err
}
+ parent.watches.Notify(ctx, childName, linux.IN_CREATE, 0 /* cookie */, vfs.PathEvent, false /* unlinked */)
return &fd.vfsfd, nil
}
@@ -1002,9 +1028,224 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
}
defer mnt.EndWrite()
- // FIXME(gvisor.dev/issue/1199): Actually implement rename.
- _ = newParent
- return syserror.EXDEV
+ oldParent := oldParentVD.Dentry().Impl().(*dentry)
+ creds := rp.Credentials()
+ if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
+ return err
+ }
+ // We need a dentry representing the renamed file since, if it's a
+ // directory, we need to check for write permission on it.
+ oldParent.dirMu.Lock()
+ defer oldParent.dirMu.Unlock()
+ renamed, err := fs.getChildLocked(ctx, oldParent, oldName, &ds)
+ if err != nil {
+ return err
+ }
+ if err := vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&oldParent.mode)), auth.KUID(atomic.LoadUint32(&renamed.uid))); err != nil {
+ return err
+ }
+ if renamed.isDir() {
+ if renamed == newParent || genericIsAncestorDentry(renamed, newParent) {
+ return syserror.EINVAL
+ }
+ if oldParent != newParent {
+ if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil {
+ return err
+ }
+ }
+ } else {
+ if opts.MustBeDir || rp.MustBeDir() {
+ return syserror.ENOTDIR
+ }
+ }
+
+ if oldParent != newParent {
+ if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
+ return err
+ }
+ newParent.dirMu.Lock()
+ defer newParent.dirMu.Unlock()
+ }
+ if newParent.vfsd.IsDead() {
+ return syserror.ENOENT
+ }
+ replacedLayer, err := fs.lookupLayerLocked(ctx, newParent, newName)
+ if err != nil {
+ return err
+ }
+ var (
+ replaced *dentry
+ replacedVFSD *vfs.Dentry
+ whiteouts map[string]bool
+ )
+ if replacedLayer.existsInOverlay() {
+ replaced, err = fs.getChildLocked(ctx, newParent, newName, &ds)
+ if err != nil {
+ return err
+ }
+ replacedVFSD = &replaced.vfsd
+ if replaced.isDir() {
+ if !renamed.isDir() {
+ return syserror.EISDIR
+ }
+ if genericIsAncestorDentry(replaced, renamed) {
+ return syserror.ENOTEMPTY
+ }
+ replaced.dirMu.Lock()
+ defer replaced.dirMu.Unlock()
+ whiteouts, err = replaced.collectWhiteoutsForRmdirLocked(ctx)
+ if err != nil {
+ return err
+ }
+ } else {
+ if rp.MustBeDir() || renamed.isDir() {
+ return syserror.ENOTDIR
+ }
+ }
+ }
+
+ if oldParent == newParent && oldName == newName {
+ return nil
+ }
+
+ // renamed and oldParent need to be copied-up before they're renamed on the
+ // upper layer.
+ if err := renamed.copyUpLocked(ctx); err != nil {
+ return err
+ }
+ // If renamed is a directory, all of its descendants need to be copied-up
+ // before they're renamed on the upper layer.
+ if renamed.isDir() {
+ if err := renamed.copyUpDescendantsLocked(ctx, &ds); err != nil {
+ return err
+ }
+ }
+ // newParent must be copied-up before it can contain renamed on the upper
+ // layer.
+ if err := newParent.copyUpLocked(ctx); err != nil {
+ return err
+ }
+ // If replaced exists, it doesn't need to be copied-up, but we do need to
+ // serialize with copy-up. Holding renameMu for writing should be
+ // sufficient, but out of an abundance of caution...
+ if replaced != nil {
+ replaced.copyMu.RLock()
+ defer replaced.copyMu.RUnlock()
+ }
+
+ vfsObj := rp.VirtualFilesystem()
+ mntns := vfs.MountNamespaceFromContext(ctx)
+ defer mntns.DecRef(ctx)
+ if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
+ return err
+ }
+
+ newpop := vfs.PathOperation{
+ Root: newParent.upperVD,
+ Start: newParent.upperVD,
+ Path: fspath.Parse(newName),
+ }
+
+ needRecreateWhiteouts := false
+ cleanupRecreateWhiteouts := func() {
+ if !needRecreateWhiteouts {
+ return
+ }
+ for whiteoutName, whiteoutUpper := range whiteouts {
+ if !whiteoutUpper {
+ continue
+ }
+ if err := fs.createWhiteout(ctx, vfsObj, &vfs.PathOperation{
+ Root: replaced.upperVD,
+ Start: replaced.upperVD,
+ Path: fspath.Parse(whiteoutName),
+ }); err != nil && err != syserror.EEXIST {
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RenameAt failure: %v", err))
+ }
+ }
+ }
+ if renamed.isDir() {
+ if replacedLayer == lookupLayerUpper {
+ // Remove whiteouts from the directory being replaced.
+ needRecreateWhiteouts = true
+ for whiteoutName, whiteoutUpper := range whiteouts {
+ if !whiteoutUpper {
+ continue
+ }
+ if err := vfsObj.UnlinkAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: replaced.upperVD,
+ Start: replaced.upperVD,
+ Path: fspath.Parse(whiteoutName),
+ }); err != nil {
+ cleanupRecreateWhiteouts()
+ vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+ return err
+ }
+ }
+ } else if replacedLayer == lookupLayerUpperWhiteout {
+ // We need to explicitly remove the whiteout since otherwise rename
+ // on the upper layer will fail with ENOTDIR.
+ if err := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); err != nil {
+ vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+ return err
+ }
+ }
+ }
+
+ // Essentially no gVisor filesystem supports RENAME_WHITEOUT, so just do a
+ // regular rename and create the whiteout at the origin manually. Unlike
+ // RENAME_WHITEOUT, this isn't atomic with respect to other users of the
+ // upper filesystem, but this is already the case for virtually all other
+ // overlay filesystem operations too.
+ oldpop := vfs.PathOperation{
+ Root: oldParent.upperVD,
+ Start: oldParent.upperVD,
+ Path: fspath.Parse(oldName),
+ }
+ if err := vfsObj.RenameAt(ctx, creds, &oldpop, &newpop, &opts); err != nil {
+ cleanupRecreateWhiteouts()
+ vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+ return err
+ }
+
+ // Below this point, the renamed dentry is now at newpop, and anything we
+ // replaced is gone forever. Commit the rename, update the overlay
+ // filesystem tree, and abandon attempts to recover from errors.
+ vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD)
+ delete(oldParent.children, oldName)
+ if replaced != nil {
+ ds = appendDentry(ds, replaced)
+ }
+ if oldParent != newParent {
+ newParent.dirents = nil
+ // This can't drop the last reference on oldParent because one is held
+ // by oldParentVD, so lock recursion is impossible.
+ oldParent.DecRef(ctx)
+ ds = appendDentry(ds, oldParent)
+ newParent.IncRef()
+ renamed.parent = newParent
+ }
+ renamed.name = newName
+ if newParent.children == nil {
+ newParent.children = make(map[string]*dentry)
+ }
+ newParent.children[newName] = renamed
+ oldParent.dirents = nil
+
+ if err := fs.createWhiteout(ctx, vfsObj, &oldpop); err != nil {
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout at origin after RenameAt: %v", err))
+ }
+ if renamed.isDir() {
+ if err := vfsObj.SetXattrAt(ctx, fs.creds, &newpop, &vfs.SetXattrOptions{
+ Name: _OVL_XATTR_OPAQUE,
+ Value: "y",
+ }); err != nil {
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to make renamed directory opaque: %v", err))
+ }
+ }
+
+ vfs.InotifyRename(ctx, &renamed.watches, &oldParent.watches, &newParent.watches, oldName, newName, renamed.isDir())
+ return nil
}
// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
@@ -1083,7 +1324,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
Start: child.upperVD,
Path: fspath.Parse(whiteoutName),
}); err != nil && err != syserror.EEXIST {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err))
}
}
}
@@ -1113,15 +1354,14 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
// Don't attempt to recover from this: the original directory is
// already gone, so any dentries representing it are invalid, and
// creating a new directory won't undo that.
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during RmdirAt: %v", err)
- vfsObj.AbortDeleteDentry(&child.vfsd)
- return err
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout during RmdirAt: %v", err))
}
vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
delete(parent.children, name)
ds = appendDentry(ds, child)
parent.dirents = nil
+ parent.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0 /* cookie */, vfs.InodeEvent, true /* unlinked */)
return nil
}
@@ -1129,12 +1369,25 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
+ fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
return err
}
+ err = d.setStatLocked(ctx, rp, opts)
+ fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ if err != nil {
+ return err
+ }
+
+ if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+ d.InotifyWithParent(ctx, ev, 0 /* cookie */, vfs.InodeEvent)
+ }
+ return nil
+}
+// Precondition: d.fs.renameMu must be held for reading.
+func (d *dentry) setStatLocked(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
mode := linux.FileMode(atomic.LoadUint32(&d.mode))
if err := vfs.CheckSetStat(ctx, rp.Credentials(), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
return err
@@ -1229,7 +1482,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
},
}); err != nil {
if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr))
} else if haveUpperWhiteout {
fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
}
@@ -1322,70 +1575,175 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
}
}
if err := fs.createWhiteout(ctx, vfsObj, &pop); err != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during UnlinkAt: %v", err)
- if child != nil {
- vfsObj.AbortDeleteDentry(&child.vfsd)
- }
- return err
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout during UnlinkAt: %v", err))
}
+ var cw *vfs.Watches
if child != nil {
vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
delete(parent.children, name)
ds = appendDentry(ds, child)
+ cw = &child.watches
}
+ vfs.InotifyRemoveChild(ctx, cw, &parent.watches, name)
parent.dirents = nil
return nil
}
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// isOverlayXattr returns whether the given extended attribute configures the
+// overlay.
+func isOverlayXattr(name string) bool {
+ return strings.HasPrefix(name, _OVL_XATTR_PREFIX)
+}
+
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
- _, err := fs.resolveLocked(ctx, rp, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return nil, err
}
- // TODO(gvisor.dev/issue/1199): Linux overlayfs actually allows listxattr,
- // but not any other xattr syscalls. For now we just reject all of them.
- return nil, syserror.ENOTSUP
+
+ return fs.listXattr(ctx, d, size)
}
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+func (fs *filesystem) listXattr(ctx context.Context, d *dentry, size uint64) ([]string, error) {
+ vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ top := d.topLayer()
+ names, err := vfsObj.ListXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, size)
+ if err != nil {
+ return nil, err
+ }
+
+ // Filter out all overlay attributes.
+ n := 0
+ for _, name := range names {
+ if !isOverlayXattr(name) {
+ names[n] = name
+ n++
+ }
+ }
+ return names[:n], err
+}
+
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
- _, err := fs.resolveLocked(ctx, rp, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return "", err
}
- return "", syserror.ENOTSUP
+
+ return fs.getXattr(ctx, d, rp.Credentials(), &opts)
}
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+func (fs *filesystem) getXattr(ctx context.Context, d *dentry, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
+ if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
+ return "", err
+ }
+
+ // Return EOPNOTSUPP when fetching an overlay attribute.
+ // See fs/overlayfs/super.c:ovl_own_xattr_get().
+ if isOverlayXattr(opts.Name) {
+ return "", syserror.EOPNOTSUPP
+ }
+
+ // Analogous to fs/overlayfs/super.c:ovl_other_xattr_get().
+ vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ top := d.topLayer()
+ return vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, opts)
+}
+
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
- _, err := fs.resolveLocked(ctx, rp, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ return err
+ }
+
+ err = fs.setXattrLocked(ctx, d, rp.Mount(), rp.Credentials(), &opts)
+ fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
if err != nil {
return err
}
- return syserror.ENOTSUP
+
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent)
+ return nil
+}
+
+// Precondition: fs.renameMu must be locked.
+func (fs *filesystem) setXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
+ if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
+ return err
+ }
+
+ // Return EOPNOTSUPP when setting an overlay attribute.
+ // See fs/overlayfs/super.c:ovl_own_xattr_set().
+ if isOverlayXattr(opts.Name) {
+ return syserror.EOPNOTSUPP
+ }
+
+ // Analogous to fs/overlayfs/super.c:ovl_other_xattr_set().
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer mnt.EndWrite()
+ if err := d.copyUpLocked(ctx); err != nil {
+ return err
+ }
+ vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ return vfsObj.SetXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, opts)
}
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
- _, err := fs.resolveLocked(ctx, rp, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ return err
+ }
+
+ err = fs.removeXattrLocked(ctx, d, rp.Mount(), rp.Credentials(), name)
+ fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
if err != nil {
return err
}
- return syserror.ENOTSUP
+
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent)
+ return nil
+}
+
+// Precondition: fs.renameMu must be locked.
+func (fs *filesystem) removeXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, name string) error {
+ if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
+ return err
+ }
+
+ // Like SetXattrAt, return EOPNOTSUPP when removing an overlay attribute.
+ // Linux passes the remove request to xattr_handler->set.
+ // See fs/xattr.c:vfs_removexattr().
+ if isOverlayXattr(name) {
+ return syserror.EOPNOTSUPP
+ }
+
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer mnt.EndWrite()
+ if err := d.copyUpLocked(ctx); err != nil {
+ return err
+ }
+ vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ return vfsObj.RemoveXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, name)
}
// PrependPath implements vfs.FilesystemImpl.PrependPath.
diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go
index 268b32537..853aee951 100644
--- a/pkg/sentry/fsimpl/overlay/non_directory.go
+++ b/pkg/sentry/fsimpl/overlay/non_directory.go
@@ -23,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -38,6 +39,7 @@ func (d *dentry) readlink(ctx context.Context) (string, error) {
})
}
+// +stateify savable
type nonDirectoryFD struct {
fileDescription
@@ -46,7 +48,7 @@ type nonDirectoryFD struct {
// fileDescription.dentry().upperVD. cachedFlags is the last known value of
// cachedFD.StatusFlags(). copiedUp, cachedFD, and cachedFlags are
// protected by mu.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
copiedUp bool
cachedFD *vfs.FileDescription
cachedFlags uint32
@@ -146,6 +148,16 @@ func (fd *nonDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux
return stat, nil
}
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *nonDirectoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ wrappedFD, err := fd.getCurrentFD(ctx)
+ if err != nil {
+ return err
+ }
+ defer wrappedFD.DecRef(ctx)
+ return wrappedFD.Allocate(ctx, mode, offset, length)
+}
+
// SetStat implements vfs.FileDescriptionImpl.SetStat.
func (fd *nonDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
d := fd.dentry()
@@ -172,6 +184,9 @@ func (fd *nonDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions)
return err
}
d.updateAfterSetStatLocked(&opts)
+ if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+ d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
+ }
return nil
}
@@ -256,10 +271,105 @@ func (fd *nonDirectoryFD) Sync(ctx context.Context) error {
// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
func (fd *nonDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
- wrappedFD, err := fd.getCurrentFD(ctx)
+ if err := fd.ensureMappable(ctx, opts); err != nil {
+ return err
+ }
+ return vfs.GenericConfigureMMap(&fd.vfsfd, fd.dentry(), opts)
+}
+
+// ensureMappable ensures that fd.dentry().wrappedMappable is not nil.
+func (fd *nonDirectoryFD) ensureMappable(ctx context.Context, opts *memmap.MMapOpts) error {
+ d := fd.dentry()
+
+ // Fast path if we already have a Mappable for the current top layer.
+ if atomic.LoadUint32(&d.isMappable) != 0 {
+ return nil
+ }
+
+ // Only permit mmap of regular files, since other file types may have
+ // unpredictable behavior when mmapped (e.g. /dev/zero).
+ if atomic.LoadUint32(&d.mode)&linux.S_IFMT != linux.S_IFREG {
+ return syserror.ENODEV
+ }
+
+ // Get a Mappable for the current top layer.
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ d.copyMu.RLock()
+ defer d.copyMu.RUnlock()
+ if atomic.LoadUint32(&d.isMappable) != 0 {
+ return nil
+ }
+ wrappedFD, err := fd.currentFDLocked(ctx)
if err != nil {
return err
}
- defer wrappedFD.DecRef(ctx)
- return wrappedFD.ConfigureMMap(ctx, opts)
+ if err := wrappedFD.ConfigureMMap(ctx, opts); err != nil {
+ return err
+ }
+ if opts.MappingIdentity != nil {
+ opts.MappingIdentity.DecRef(ctx)
+ opts.MappingIdentity = nil
+ }
+ // Use this Mappable for all mappings of this layer (unless we raced with
+ // another call to ensureMappable).
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ d.dataMu.Lock()
+ defer d.dataMu.Unlock()
+ if d.wrappedMappable == nil {
+ d.wrappedMappable = opts.Mappable
+ atomic.StoreUint32(&d.isMappable, 1)
+ }
+ return nil
+}
+
+// AddMapping implements memmap.Mappable.AddMapping.
+func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ if err := d.wrappedMappable.AddMapping(ctx, ms, ar, offset, writable); err != nil {
+ return err
+ }
+ if !d.isCopiedUp() {
+ d.lowerMappings.AddMapping(ms, ar, offset, writable)
+ }
+ return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ d.wrappedMappable.RemoveMapping(ctx, ms, ar, offset, writable)
+ if !d.isCopiedUp() {
+ d.lowerMappings.RemoveMapping(ms, ar, offset, writable)
+ }
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error {
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ if err := d.wrappedMappable.CopyMapping(ctx, ms, srcAR, dstAR, offset, writable); err != nil {
+ return err
+ }
+ if !d.isCopiedUp() {
+ d.lowerMappings.AddMapping(ms, dstAR, offset, writable)
+ }
+ return nil
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+ d.dataMu.RLock()
+ defer d.dataMu.RUnlock()
+ return d.wrappedMappable.Translate(ctx, required, optional, at)
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ return d.wrappedMappable.InvalidateUnsavable(ctx)
}
diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go
index 00562667f..dfbccd05f 100644
--- a/pkg/sentry/fsimpl/overlay/overlay.go
+++ b/pkg/sentry/fsimpl/overlay/overlay.go
@@ -22,6 +22,10 @@
// filesystem.renameMu
// dentry.dirMu
// dentry.copyMu
+// *** "memmap.Mappable locks" below this point
+// dentry.mapsMu
+// *** "memmap.Mappable locks taken by Translate" below this point
+// dentry.dataMu
//
// Locking dentry.dirMu in multiple dentries requires that parent dentries are
// locked before child dentries, and that filesystem.renameMu is locked to
@@ -37,6 +41,7 @@ import (
"gvisor.dev/gvisor/pkg/fspath"
fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
@@ -46,6 +51,8 @@ import (
const Name = "overlay"
// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type FilesystemType struct{}
// Name implements vfs.FilesystemType.Name.
@@ -55,6 +62,8 @@ func (FilesystemType) Name() string {
// FilesystemOptions may be passed as vfs.GetFilesystemOptions.InternalData to
// FilesystemType.GetFilesystem.
+//
+// +stateify savable
type FilesystemOptions struct {
// Callers passing FilesystemOptions to
// overlay.FilesystemType.GetFilesystem() are responsible for ensuring that
@@ -71,6 +80,8 @@ type FilesystemOptions struct {
}
// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
type filesystem struct {
vfsfs vfs.Filesystem
@@ -93,7 +104,7 @@ type filesystem struct {
// renameMu synchronizes renaming with non-renaming operations in order to
// ensure consistent lock ordering between dentry.dirMu in different
// dentries.
- renameMu sync.RWMutex
+ renameMu sync.RWMutex `state:"nosave"`
// lastDirIno is the last inode number assigned to a directory. lastDirIno
// is accessed using atomic memory operations.
@@ -106,16 +117,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
fsoptsRaw := opts.InternalData
fsopts, haveFSOpts := fsoptsRaw.(FilesystemOptions)
if fsoptsRaw != nil && !haveFSOpts {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw)
return nil, nil, syserror.EINVAL
}
if haveFSOpts {
if len(fsopts.LowerRoots) == 0 {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: LowerRoots must be non-empty")
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: LowerRoots must be non-empty")
return nil, nil, syserror.EINVAL
}
if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two LowerRoots are required when UpperRoot is unspecified")
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two LowerRoots are required when UpperRoot is unspecified")
return nil, nil, syserror.EINVAL
}
// We don't enforce a maximum number of lower layers when not
@@ -132,7 +143,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
delete(mopts, "workdir")
upperPath := fspath.Parse(upperPathname)
if !upperPath.Absolute {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname)
return nil, nil, syserror.EINVAL
}
upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
@@ -144,13 +155,13 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
CheckSearchable: true,
})
if err != nil {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err)
return nil, nil, err
}
defer upperRoot.DecRef(ctx)
privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */)
if err != nil {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err)
return nil, nil, err
}
defer privateUpperRoot.DecRef(ctx)
@@ -158,24 +169,24 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
}
lowerPathnamesStr, ok := mopts["lowerdir"]
if !ok {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: missing required option lowerdir")
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: missing required option lowerdir")
return nil, nil, syserror.EINVAL
}
delete(mopts, "lowerdir")
lowerPathnames := strings.Split(lowerPathnamesStr, ":")
const maxLowerLayers = 500 // Linux: fs/overlay/super.c:OVL_MAX_STACK
if len(lowerPathnames) < 2 && !fsopts.UpperRoot.Ok() {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two lowerdirs are required when upperdir is unspecified")
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two lowerdirs are required when upperdir is unspecified")
return nil, nil, syserror.EINVAL
}
if len(lowerPathnames) > maxLowerLayers {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: %d lowerdirs specified, maximum %d", len(lowerPathnames), maxLowerLayers)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: %d lowerdirs specified, maximum %d", len(lowerPathnames), maxLowerLayers)
return nil, nil, syserror.EINVAL
}
for _, lowerPathname := range lowerPathnames {
lowerPath := fspath.Parse(lowerPathname)
if !lowerPath.Absolute {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname)
return nil, nil, syserror.EINVAL
}
lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
@@ -187,13 +198,13 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
CheckSearchable: true,
})
if err != nil {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err)
return nil, nil, err
}
defer lowerRoot.DecRef(ctx)
privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */)
if err != nil {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err)
return nil, nil, err
}
defer privateLowerRoot.DecRef(ctx)
@@ -201,7 +212,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
}
}
if len(mopts) != 0 {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts)
return nil, nil, syserror.EINVAL
}
@@ -274,7 +285,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
return nil, nil, syserror.EREMOTE
}
if isWhiteout(&rootStat) {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout")
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout")
root.destroyLocked(ctx)
fs.vfsfs.DecRef(ctx)
return nil, nil, syserror.EINVAL
@@ -362,6 +373,8 @@ func (fs *filesystem) newDirIno() uint64 {
}
// dentry implements vfs.DentryImpl.
+//
+// +stateify savable
type dentry struct {
vfsd vfs.Dentry
@@ -394,7 +407,7 @@ type dentry struct {
// and dirents (if not nil) is a cache of dirents as returned by
// directoryFDs representing this directory. children is protected by
// dirMu.
- dirMu sync.Mutex
+ dirMu sync.Mutex `state:"nosave"`
children map[string]*dentry
dirents []vfs.Dirent
@@ -404,7 +417,7 @@ type dentry struct {
// If !upperVD.Ok(), it can transition to a valid vfs.VirtualDentry (i.e.
// be copied up) with copyMu locked for writing; otherwise, it is
// immutable. lowerVDs is always immutable.
- copyMu sync.RWMutex
+ copyMu sync.RWMutex `state:"nosave"`
upperVD vfs.VirtualDentry
lowerVDs []vfs.VirtualDentry
@@ -419,7 +432,43 @@ type dentry struct {
devMinor uint32
ino uint64
+ // If this dentry represents a regular file, then:
+ //
+ // - mapsMu is used to synchronize between copy-up and memmap.Mappable
+ // methods on dentry preceding mm.MemoryManager.activeMu in the lock order.
+ //
+ // - dataMu is used to synchronize between copy-up and
+ // dentry.(memmap.Mappable).Translate.
+ //
+ // - lowerMappings tracks memory mappings of the file. lowerMappings is
+ // used to invalidate mappings of the lower layer when the file is copied
+ // up to ensure that they remain coherent with subsequent writes to the
+ // file. (Note that, as of this writing, Linux overlayfs does not do this;
+ // this feature is a gVisor extension.) lowerMappings is protected by
+ // mapsMu.
+ //
+ // - If this dentry is copied-up, then wrappedMappable is the Mappable
+ // obtained from a call to the current top layer's
+ // FileDescription.ConfigureMMap(). Once wrappedMappable becomes non-nil
+ // (from a call to nonDirectoryFD.ensureMappable()), it cannot become nil.
+ // wrappedMappable is protected by mapsMu and dataMu.
+ //
+ // - isMappable is non-zero iff wrappedMappable is non-nil. isMappable is
+ // accessed using atomic memory operations.
+ mapsMu sync.Mutex
+ lowerMappings memmap.MappingSet
+ dataMu sync.RWMutex
+ wrappedMappable memmap.Mappable
+ isMappable uint32
+
locks vfs.FileLocks
+
+ // watches is the set of inotify watches on the file repesented by this dentry.
+ //
+ // Note that hard links to the same file will not share the same set of
+ // watches, due to the fact that we do not have inode structures in this
+ // overlay implementation.
+ watches vfs.Watches
}
// newDentry creates a new dentry. The dentry initially has no references; it
@@ -479,6 +528,14 @@ func (d *dentry) checkDropLocked(ctx context.Context) {
if atomic.LoadInt64(&d.refs) != 0 {
return
}
+
+ // Make sure that we do not lose watches on dentries that have not been
+ // deleted. Note that overlayfs never calls VFS.InvalidateDentry(), so
+ // d.vfsd.IsDead() indicates that d was deleted.
+ if !d.vfsd.IsDead() && d.watches.Size() > 0 {
+ return
+ }
+
// Refs is still zero; destroy it.
d.destroyLocked(ctx)
return
@@ -507,6 +564,8 @@ func (d *dentry) destroyLocked(ctx context.Context) {
lowerVD.DecRef(ctx)
}
+ d.watches.HandleDeletion(ctx)
+
if d.parent != nil {
d.parent.dirMu.Lock()
if !d.vfsd.IsDead() {
@@ -525,19 +584,36 @@ func (d *dentry) destroyLocked(ctx context.Context) {
// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
func (d *dentry) InotifyWithParent(ctx context.Context, events uint32, cookie uint32, et vfs.EventType) {
- // TODO(gvisor.dev/issue/1479): Implement inotify.
+ if d.isDir() {
+ events |= linux.IN_ISDIR
+ }
+
+ // overlayfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates
+ // that d was deleted.
+ deleted := d.vfsd.IsDead()
+
+ d.fs.renameMu.RLock()
+ // The ordering below is important, Linux always notifies the parent first.
+ if d.parent != nil {
+ d.parent.watches.Notify(ctx, d.name, events, cookie, et, deleted)
+ }
+ d.watches.Notify(ctx, "", events, cookie, et, deleted)
+ d.fs.renameMu.RUnlock()
}
// Watches implements vfs.DentryImpl.Watches.
func (d *dentry) Watches() *vfs.Watches {
- // TODO(gvisor.dev/issue/1479): Implement inotify.
- return nil
+ return &d.watches
}
// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
-//
-// TODO(gvisor.dev/issue/1479): Implement inotify.
-func (d *dentry) OnZeroWatches(context.Context) {}
+func (d *dentry) OnZeroWatches(ctx context.Context) {
+ if atomic.LoadInt64(&d.refs) == 0 {
+ d.fs.renameMu.Lock()
+ d.checkDropLocked(ctx)
+ d.fs.renameMu.Unlock()
+ }
+}
// iterLayers invokes yield on each layer comprising d, from top to bottom. If
// any call to yield returns false, iterLayer stops iteration.
@@ -570,6 +646,16 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes)
return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
}
+func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
+ mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+ kuid := auth.KUID(atomic.LoadUint32(&d.uid))
+ kgid := auth.KGID(atomic.LoadUint32(&d.gid))
+ if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
+ return err
+ }
+ return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
+}
+
// statInternalMask is the set of stat fields that is set by
// dentry.statInternalTo().
const statInternalMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
@@ -608,6 +694,8 @@ func (d *dentry) updateAfterSetStatLocked(opts *vfs.SetStatOptions) {
// fileDescription is embedded by overlay implementations of
// vfs.FileDescriptionImpl.
+//
+// +stateify savable
type fileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
@@ -622,6 +710,48 @@ func (fd *fileDescription) dentry() *dentry {
return fd.vfsfd.Dentry().Impl().(*dentry)
}
+// ListXattr implements vfs.FileDescriptionImpl.ListXattr.
+func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
+ return fd.filesystem().listXattr(ctx, fd.dentry(), size)
+}
+
+// GetXattr implements vfs.FileDescriptionImpl.GetXattr.
+func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
+ return fd.filesystem().getXattr(ctx, fd.dentry(), auth.CredentialsFromContext(ctx), &opts)
+}
+
+// SetXattr implements vfs.FileDescriptionImpl.SetXattr.
+func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
+ fs := fd.filesystem()
+ d := fd.dentry()
+
+ fs.renameMu.RLock()
+ err := fs.setXattrLocked(ctx, d, fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), &opts)
+ fs.renameMu.RUnlock()
+ if err != nil {
+ return err
+ }
+
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ return nil
+}
+
+// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
+func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
+ fs := fd.filesystem()
+ d := fd.dentry()
+
+ fs.renameMu.RLock()
+ err := fs.removeXattrLocked(ctx, d, fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), name)
+ fs.renameMu.RUnlock()
+ if err != nil {
+ return err
+ }
+
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ return nil
+}
+
// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go
index 7053ad6db..4e2da4810 100644
--- a/pkg/sentry/fsimpl/pipefs/pipefs.go
+++ b/pkg/sentry/fsimpl/pipefs/pipefs.go
@@ -31,6 +31,7 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
+// +stateify savable
type filesystemType struct{}
// Name implements vfs.FilesystemType.Name.
@@ -43,6 +44,7 @@ func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFile
panic("pipefs.filesystemType.GetFilesystem should never be called")
}
+// +stateify savable
type filesystem struct {
kernfs.Filesystem
@@ -76,6 +78,8 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
}
// inode implements kernfs.Inode.
+//
+// +stateify savable
type inode struct {
kernfs.InodeNotDirectory
kernfs.InodeNotSymlink
@@ -144,8 +148,8 @@ func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.
}
// Open implements kernfs.Inode.Open.
-func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags, &i.locks)
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ return i.pipe.Open(ctx, rp.Mount(), d.VFSDentry(), opts.Flags, &i.locks)
}
// StatFS implements kernfs.Inode.StatFS.
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index a45b44440..2e086e34c 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -100,6 +100,7 @@ go_library(
"//pkg/sync",
"//pkg/syserror",
"//pkg/tcpip/header",
+ "//pkg/tcpip/network/ipv4",
"//pkg/usermem",
],
)
diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go
index 03b5941b9..05d7948ea 100644
--- a/pkg/sentry/fsimpl/proc/filesystem.go
+++ b/pkg/sentry/fsimpl/proc/filesystem.go
@@ -41,6 +41,7 @@ func (FilesystemType) Name() string {
return Name
}
+// +stateify savable
type filesystem struct {
kernfs.Filesystem
@@ -84,6 +85,8 @@ func (fs *filesystem) Release(ctx context.Context) {
// dynamicInode is an overfitted interface for common Inodes with
// dynamicByteSource types used in procfs.
+//
+// +stateify savable
type dynamicInode interface {
kernfs.Inode
vfs.DynamicBytesSource
@@ -99,6 +102,7 @@ func (fs *filesystem) newDentry(creds *auth.Credentials, ino uint64, perm linux.
return d
}
+// +stateify savable
type staticFile struct {
kernfs.DynamicBytesFile
vfs.StaticData
@@ -118,10 +122,13 @@ func newStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64
// InternalData contains internal data passed in to the procfs mount via
// vfs.GetFilesystemOptions.InternalData.
+//
+// +stateify savable
type InternalData struct {
Cgroups map[string]string
}
+// +stateify savable
type implStatFS struct{}
// StatFS implements kernfs.Inode.StatFS.
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index d57d94dbc..47ecd941c 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -68,8 +68,8 @@ func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace,
return dentry
}
-// Lookup implements kernfs.inodeDynamicLookup.
-func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+// Lookup implements kernfs.inodeDynamicLookup.Lookup.
+func (i *subtasksInode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, error) {
tid, err := strconv.ParseUint(name, 10, 32)
if err != nil {
return nil, syserror.ENOENT
@@ -82,12 +82,10 @@ func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, e
if subTask.ThreadGroup() != i.task.ThreadGroup() {
return nil, syserror.ENOENT
}
-
- subTaskDentry := i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers)
- return subTaskDentry.VFSDentry(), nil
+ return i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers), nil
}
-// IterDirents implements kernfs.inodeDynamicLookup.
+// IterDirents implements kernfs.inodeDynamicLookup.IterDirents.
func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
tasks := i.task.ThreadGroup().MemberIDs(i.pidns)
if len(tasks) == 0 {
@@ -118,6 +116,7 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb
return offset, nil
}
+// +stateify savable
type subtasksFD struct {
kernfs.GenericDirectoryFD
@@ -155,21 +154,21 @@ func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) erro
return fd.GenericDirectoryFD.SetStat(ctx, opts)
}
-// Open implements kernfs.Inode.
-func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+// Open implements kernfs.Inode.Open.
+func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd := &subtasksFD{task: i.task}
if err := fd.Init(&i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
SeekEnd: kernfs.SeekEndZero,
}); err != nil {
return nil, err
}
- if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+ if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
return nil, err
}
return fd.VFSFileDescription(), nil
}
-// Stat implements kernfs.Inode.
+// Stat implements kernfs.Inode.Stat.
func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts)
if err != nil {
@@ -181,12 +180,12 @@ func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs
return stat, nil
}
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
return syserror.EPERM
}
-// DecRef implements kernfs.Inode.
+// DecRef implements kernfs.Inode.DecRef.
func (i *subtasksInode) DecRef(context.Context) {
i.subtasksInodeRefs.DecRef(i.Destroy)
}
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index dbdb5d929..a7cd6f57e 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -53,6 +53,7 @@ func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace
"auxv": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &auxvData{task: task}),
"cmdline": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}),
"comm": fs.newComm(task, fs.NextIno(), 0444),
+ "cwd": fs.newCwdSymlink(task, fs.NextIno()),
"environ": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}),
"exe": fs.newExeSymlink(task, fs.NextIno()),
"fd": fs.newFDDirInode(task),
@@ -106,9 +107,9 @@ func (i *taskInode) Valid(ctx context.Context) bool {
return i.task.ExitState() != kernel.TaskExitDead
}
-// Open implements kernfs.Inode.
-func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+// Open implements kernfs.Inode.Open.
+func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
SeekEnd: kernfs.SeekEndZero,
})
if err != nil {
@@ -117,18 +118,20 @@ func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D
return fd.VFSFileDescription(), nil
}
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
return syserror.EPERM
}
-// DecRef implements kernfs.Inode.
+// DecRef implements kernfs.Inode.DecRef.
func (i *taskInode) DecRef(context.Context) {
i.taskInodeRefs.DecRef(i.Destroy)
}
// taskOwnedInode implements kernfs.Inode and overrides inode owner with task
// effective user and group.
+//
+// +stateify savable
type taskOwnedInode struct {
kernfs.Inode
@@ -168,7 +171,7 @@ func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.
return d
}
-// Stat implements kernfs.Inode.
+// Stat implements kernfs.Inode.Stat.
func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
stat, err := i.Inode.Stat(ctx, fs, opts)
if err != nil {
@@ -186,7 +189,7 @@ func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.
return stat, nil
}
-// CheckPermissions implements kernfs.Inode.
+// CheckPermissions implements kernfs.Inode.CheckPermissions.
func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
mode := i.Mode()
uid, gid := i.getOwner(mode)
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index 3f0d78461..0866cea2b 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -51,6 +51,7 @@ func taskFDExists(ctx context.Context, t *kernel.Task, fd int32) bool {
return true
}
+// +stateify savable
type fdDir struct {
locks vfs.FileLocks
@@ -62,7 +63,7 @@ type fdDir struct {
produceSymlink bool
}
-// IterDirents implements kernfs.inodeDynamicLookup.
+// IterDirents implements kernfs.inodeDynamicLookup.IterDirents.
func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
var fds []int32
i.task.WithMuLocked(func(t *kernel.Task) {
@@ -86,14 +87,19 @@ func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, off
Name: strconv.FormatUint(uint64(fd), 10),
Type: typ,
Ino: i.fs.NextIno(),
- NextOff: offset + 1,
+ NextOff: int64(fd) + 3,
}
if err := cb.Handle(dirent); err != nil {
- return offset, err
+ // Getdents should iterate correctly despite mutation
+ // of fds, so we return the next fd to serialize plus
+ // 2 (which accounts for the "." and ".." tracked by
+ // kernfs) as the offset.
+ return int64(fd) + 2, err
}
- offset++
}
- return offset, nil
+ // We serialized them all. Next offset should be higher than last
+ // serialized fd.
+ return int64(fds[len(fds)-1]) + 3, nil
}
// fdDirInode represents the inode for /proc/[pid]/fd directory.
@@ -130,8 +136,8 @@ func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry {
return dentry
}
-// Lookup implements kernfs.inodeDynamicLookup.
-func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+// Lookup implements kernfs.inodeDynamicLookup.Lookup.
+func (i *fdDirInode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, error) {
fdInt, err := strconv.ParseInt(name, 10, 32)
if err != nil {
return nil, syserror.ENOENT
@@ -140,13 +146,12 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro
if !taskFDExists(ctx, i.task, fd) {
return nil, syserror.ENOENT
}
- taskDentry := i.fs.newFDSymlink(i.task, fd, i.fs.NextIno())
- return taskDentry.VFSDentry(), nil
+ return i.fs.newFDSymlink(i.task, fd, i.fs.NextIno()), nil
}
-// Open implements kernfs.Inode.
-func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+// Open implements kernfs.Inode.Open.
+func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
SeekEnd: kernfs.SeekEndZero,
})
if err != nil {
@@ -155,7 +160,7 @@ func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.
return fd.VFSFileDescription(), nil
}
-// CheckPermissions implements kernfs.Inode.
+// CheckPermissions implements kernfs.Inode.CheckPermissions.
//
// This is to match Linux, which uses a special permission handler to guarantee
// that a process can still access /proc/self/fd after it has executed
@@ -177,7 +182,7 @@ func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentia
return err
}
-// DecRef implements kernfs.Inode.
+// DecRef implements kernfs.Inode.DecRef.
func (i *fdDirInode) DecRef(context.Context) {
i.fdDirInodeRefs.DecRef(i.Destroy)
}
@@ -209,7 +214,7 @@ func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) *ker
return d
}
-func (s *fdSymlink) Readlink(ctx context.Context) (string, error) {
+func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
file, _ := getTaskFD(s.task, s.fd)
if file == nil {
return "", syserror.ENOENT
@@ -264,8 +269,8 @@ func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry {
return dentry
}
-// Lookup implements kernfs.inodeDynamicLookup.
-func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+// Lookup implements kernfs.inodeDynamicLookup.Lookup.
+func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, error) {
fdInt, err := strconv.ParseInt(name, 10, 32)
if err != nil {
return nil, syserror.ENOENT
@@ -278,13 +283,12 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry,
task: i.task,
fd: fd,
}
- dentry := i.fs.newTaskOwnedFile(i.task, i.fs.NextIno(), 0444, data)
- return dentry.VFSDentry(), nil
+ return i.fs.newTaskOwnedFile(i.task, i.fs.NextIno(), 0444, data), nil
}
-// Open implements kernfs.Inode.
-func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+// Open implements kernfs.Inode.Open.
+func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
SeekEnd: kernfs.SeekEndZero,
})
if err != nil {
@@ -293,7 +297,7 @@ func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *
return fd.VFSFileDescription(), nil
}
-// DecRef implements kernfs.Inode.
+// DecRef implements kernfs.Inode.DecRef.
func (i *fdInfoDirInode) DecRef(context.Context) {
i.fdInfoDirInodeRefs.DecRef(i.Destroy)
}
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 356036b9b..3fbf081a6 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -543,7 +543,7 @@ func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error {
var vss, rss, data uint64
s.task.WithMuLocked(func(t *kernel.Task) {
if fdTable := t.FDTable(); fdTable != nil {
- fds = fdTable.Size()
+ fds = fdTable.CurrentMaxFDs()
}
if mm := t.MemoryManager(); mm != nil {
vss = mm.VirtualMemorySize()
@@ -667,20 +667,24 @@ func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentr
return d
}
-// Readlink implements kernfs.Inode.
-func (s *exeSymlink) Readlink(ctx context.Context) (string, error) {
- if !kernel.ContextCanTrace(ctx, s.task, false) {
- return "", syserror.EACCES
- }
-
- // Pull out the executable for /proc/[pid]/exe.
- exec, err := s.executable()
+// Readlink implements kernfs.Inode.Readlink.
+func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
+ exec, _, err := s.Getlink(ctx, nil)
if err != nil {
return "", err
}
defer exec.DecRef(ctx)
- return exec.PathnameWithDeleted(ctx), nil
+ root := vfs.RootFromContext(ctx)
+ if !root.Ok() {
+ // It could have raced with process deletion.
+ return "", syserror.ESRCH
+ }
+ defer root.DecRef(ctx)
+
+ vfsObj := exec.Mount().Filesystem().VirtualFilesystem()
+ name, _ := vfsObj.PathnameWithDeleted(ctx, root, exec)
+ return name, nil
}
// Getlink implements kernfs.Inode.Getlink.
@@ -688,23 +692,12 @@ func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDent
if !kernel.ContextCanTrace(ctx, s.task, false) {
return vfs.VirtualDentry{}, "", syserror.EACCES
}
-
- exec, err := s.executable()
- if err != nil {
- return vfs.VirtualDentry{}, "", err
- }
- defer exec.DecRef(ctx)
-
- vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry()
- vd.IncRef()
- return vd, "", nil
-}
-
-func (s *exeSymlink) executable() (file fsbridge.File, err error) {
if err := checkTaskState(s.task); err != nil {
- return nil, err
+ return vfs.VirtualDentry{}, "", err
}
+ var err error
+ var exec fsbridge.File
s.task.WithMuLocked(func(t *kernel.Task) {
mm := t.MemoryManager()
if mm == nil {
@@ -715,12 +708,78 @@ func (s *exeSymlink) executable() (file fsbridge.File, err error) {
// The MemoryManager may be destroyed, in which case
// MemoryManager.destroy will simply set the executable to nil
// (with locks held).
- file = mm.Executable()
- if file == nil {
+ exec = mm.Executable()
+ if exec == nil {
err = syserror.ESRCH
}
})
- return
+ if err != nil {
+ return vfs.VirtualDentry{}, "", err
+ }
+ defer exec.DecRef(ctx)
+
+ vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry()
+ vd.IncRef()
+ return vd, "", nil
+}
+
+// cwdSymlink is an symlink for the /proc/[pid]/cwd file.
+//
+// +stateify savable
+type cwdSymlink struct {
+ implStatFS
+ kernfs.InodeAttrs
+ kernfs.InodeNoopRefCount
+ kernfs.InodeSymlink
+
+ task *kernel.Task
+}
+
+var _ kernfs.Inode = (*cwdSymlink)(nil)
+
+func (fs *filesystem) newCwdSymlink(task *kernel.Task, ino uint64) *kernfs.Dentry {
+ inode := &cwdSymlink{task: task}
+ inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
+
+ d := &kernfs.Dentry{}
+ d.Init(inode)
+ return d
+}
+
+// Readlink implements kernfs.Inode.Readlink.
+func (s *cwdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
+ cwd, _, err := s.Getlink(ctx, nil)
+ if err != nil {
+ return "", err
+ }
+ defer cwd.DecRef(ctx)
+
+ root := vfs.RootFromContext(ctx)
+ if !root.Ok() {
+ // It could have raced with process deletion.
+ return "", syserror.ESRCH
+ }
+ defer root.DecRef(ctx)
+
+ vfsObj := cwd.Mount().Filesystem().VirtualFilesystem()
+ name, _ := vfsObj.PathnameWithDeleted(ctx, root, cwd)
+ return name, nil
+}
+
+// Getlink implements kernfs.Inode.Getlink.
+func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
+ if !kernel.ContextCanTrace(ctx, s.task, false) {
+ return vfs.VirtualDentry{}, "", syserror.EACCES
+ }
+ if err := checkTaskState(s.task); err != nil {
+ return vfs.VirtualDentry{}, "", err
+ }
+ cwd := s.task.FSContext().WorkingDirectoryVFS2()
+ if !cwd.Ok() {
+ // It could have raced with process deletion.
+ return vfs.VirtualDentry{}, "", syserror.ESRCH
+ }
+ return cwd, "", nil
}
// mountInfoData is used to implement /proc/[pid]/mountinfo.
@@ -785,6 +844,7 @@ func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
return nil
}
+// +stateify savable
type namespaceSymlink struct {
kernfs.StaticSymlink
@@ -807,15 +867,15 @@ func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns stri
return d
}
-// Readlink implements Inode.
-func (s *namespaceSymlink) Readlink(ctx context.Context) (string, error) {
+// Readlink implements kernfs.Inode.Readlink.
+func (s *namespaceSymlink) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
if err := checkTaskState(s.task); err != nil {
return "", err
}
- return s.StaticSymlink.Readlink(ctx)
+ return s.StaticSymlink.Readlink(ctx, mnt)
}
-// Getlink implements Inode.Getlink.
+// Getlink implements kernfs.Inode.Getlink.
func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
if err := checkTaskState(s.task); err != nil {
return vfs.VirtualDentry{}, "", err
@@ -832,6 +892,8 @@ func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.Vir
// namespaceInode is a synthetic inode created to represent a namespace in
// /proc/[pid]/ns/*.
+//
+// +stateify savable
type namespaceInode struct {
implStatFS
kernfs.InodeAttrs
@@ -852,12 +914,12 @@ func (i *namespaceInode) Init(creds *auth.Credentials, devMajor, devMinor uint32
i.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
}
-// Open implements Inode.Open.
-func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+// Open implements kernfs.Inode.Open.
+func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd := &namespaceFD{inode: i}
i.IncRef()
fd.LockFD.Init(&i.locks)
- if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+ if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
return nil, err
}
return &fd.vfsfd, nil
@@ -865,6 +927,8 @@ func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *
// namespace FD is a synthetic file that represents a namespace in
// /proc/[pid]/ns/*.
+//
+// +stateify savable
type namespaceFD struct {
vfs.FileDescriptionDefaultImpl
vfs.LockFD
@@ -875,20 +939,20 @@ type namespaceFD struct {
var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil)
-// Stat implements FileDescriptionImpl.
+// Stat implements vfs.FileDescriptionImpl.Stat.
func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
return fd.inode.Stat(ctx, vfs, opts)
}
-// SetStat implements FileDescriptionImpl.
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
creds := auth.CredentialsFromContext(ctx)
return fd.inode.SetStat(ctx, vfs, creds, opts)
}
-// Release implements FileDescriptionImpl.
+// Release implements vfs.FileDescriptionImpl.Release.
func (fd *namespaceFD) Release(ctx context.Context) {
fd.inode.DecRef(ctx)
}
diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go
index 4e69782c7..e7f748655 100644
--- a/pkg/sentry/fsimpl/proc/task_net.go
+++ b/pkg/sentry/fsimpl/proc/task_net.go
@@ -616,6 +616,7 @@ type netSnmpData struct {
var _ dynamicInode = (*netSnmpData)(nil)
+// +stateify savable
type snmpLine struct {
prefix string
header string
@@ -660,7 +661,7 @@ func sprintSlice(s []uint64) string {
return r[1 : len(r)-1] // Remove "[]" introduced by fmt of slice.
}
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error {
types := []interface{}{
&inet.StatSNMPIP{},
@@ -709,7 +710,7 @@ type netRouteData struct {
var _ dynamicInode = (*netRouteData)(nil)
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show.
func (d *netRouteData) Generate(ctx context.Context, buf *bytes.Buffer) error {
fmt.Fprintf(buf, "%-127s\n", "Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT")
@@ -773,7 +774,7 @@ type netStatData struct {
var _ dynamicInode = (*netStatData)(nil)
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show.
func (d *netStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
buf.WriteString("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed " +
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index 3ea00ab87..d8f5dd509 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -52,8 +52,8 @@ type tasksInode struct {
// '/proc/self' and '/proc/thread-self' have custom directory offsets in
// Linux. So handle them outside of OrderedChildren.
- selfSymlink *vfs.Dentry
- threadSelfSymlink *vfs.Dentry
+ selfSymlink *kernfs.Dentry
+ threadSelfSymlink *kernfs.Dentry
// cgroupControllers is a map of controller name to directory in the
// cgroup hierarchy. These controllers are immutable and will be listed
@@ -81,8 +81,8 @@ func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace
inode := &tasksInode{
pidns: pidns,
fs: fs,
- selfSymlink: fs.newSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(),
- threadSelfSymlink: fs.newThreadSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(),
+ selfSymlink: fs.newSelfSymlink(root, fs.NextIno(), pidns),
+ threadSelfSymlink: fs.newThreadSelfSymlink(root, fs.NextIno(), pidns),
cgroupControllers: cgroupControllers,
}
inode.InodeAttrs.Init(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
@@ -98,8 +98,8 @@ func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace
return inode, dentry
}
-// Lookup implements kernfs.inodeDynamicLookup.
-func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+// Lookup implements kernfs.inodeDynamicLookup.Lookup.
+func (i *tasksInode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, error) {
// Try to lookup a corresponding task.
tid, err := strconv.ParseUint(name, 10, 64)
if err != nil {
@@ -118,11 +118,10 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro
return nil, syserror.ENOENT
}
- taskDentry := i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers)
- return taskDentry.VFSDentry(), nil
+ return i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers), nil
}
-// IterDirents implements kernfs.inodeDynamicLookup.
+// IterDirents implements kernfs.inodeDynamicLookup.IterDirents.
func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) {
// fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256
const FIRST_PROCESS_ENTRY = 256
@@ -200,9 +199,9 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
return maxTaskID, nil
}
-// Open implements kernfs.Inode.
-func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+// Open implements kernfs.Inode.Open.
+func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
SeekEnd: kernfs.SeekEndZero,
})
if err != nil {
@@ -229,7 +228,7 @@ func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.St
return stat, nil
}
-// DecRef implements kernfs.Inode.
+// DecRef implements kernfs.Inode.DecRef.
func (i *tasksInode) DecRef(context.Context) {
i.tasksInodeRefs.DecRef(i.Destroy)
}
@@ -237,6 +236,8 @@ func (i *tasksInode) DecRef(context.Context) {
// staticFileSetStat implements a special static file that allows inode
// attributes to be set. This is to support /proc files that are readonly, but
// allow attributes to be set.
+//
+// +stateify savable
type staticFileSetStat struct {
dynamicBytesFileSetAttr
vfs.StaticData
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
index 8c41729e4..f268c59b0 100644
--- a/pkg/sentry/fsimpl/proc/tasks_files.go
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -31,6 +31,7 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
+// +stateify savable
type selfSymlink struct {
implStatFS
kernfs.InodeAttrs
@@ -51,7 +52,7 @@ func (fs *filesystem) newSelfSymlink(creds *auth.Credentials, ino uint64, pidns
return d
}
-func (s *selfSymlink) Readlink(ctx context.Context) (string, error) {
+func (s *selfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
t := kernel.TaskFromContext(ctx)
if t == nil {
// Who is reading this link?
@@ -64,16 +65,17 @@ func (s *selfSymlink) Readlink(ctx context.Context) (string, error) {
return strconv.FormatUint(uint64(tgid), 10), nil
}
-func (s *selfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
- target, err := s.Readlink(ctx)
+func (s *selfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
+ target, err := s.Readlink(ctx, mnt)
return vfs.VirtualDentry{}, target, err
}
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
return syserror.EPERM
}
+// +stateify savable
type threadSelfSymlink struct {
implStatFS
kernfs.InodeAttrs
@@ -94,7 +96,7 @@ func (fs *filesystem) newThreadSelfSymlink(creds *auth.Credentials, ino uint64,
return d
}
-func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) {
+func (s *threadSelfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
t := kernel.TaskFromContext(ctx)
if t == nil {
// Who is reading this link?
@@ -108,12 +110,12 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) {
return fmt.Sprintf("%d/task/%d", tgid, tid), nil
}
-func (s *threadSelfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
- target, err := s.Readlink(ctx)
+func (s *threadSelfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
+ target, err := s.Readlink(ctx, mnt)
return vfs.VirtualDentry{}, target, err
}
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
return syserror.EPERM
}
@@ -121,16 +123,20 @@ func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Creden
// dynamicBytesFileSetAttr implements a special file that allows inode
// attributes to be set. This is to support /proc files that are readonly, but
// allow attributes to be set.
+//
+// +stateify savable
type dynamicBytesFileSetAttr struct {
kernfs.DynamicBytesFile
}
-// SetStat implements Inode.SetStat.
+// SetStat implements kernfs.Inode.SetStat.
func (d *dynamicBytesFileSetAttr) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
return d.DynamicBytesFile.InodeAttrs.SetStat(ctx, fs, creds, opts)
}
// cpuStats contains the breakdown of CPU time for /proc/stat.
+//
+// +stateify savable
type cpuStats struct {
// user is time spent in userspace tasks with non-positive niceness.
user uint64
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
index 038a194c7..3312b0418 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -27,9 +27,11 @@ import (
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
"gvisor.dev/gvisor/pkg/usermem"
)
+// +stateify savable
type tcpMemDir int
const (
@@ -67,6 +69,7 @@ func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *ke
"tcp_rmem": fs.newDentry(root, fs.NextIno(), 0644, &tcpMemData{stack: stack, dir: tcpRMem}),
"tcp_sack": fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}),
"tcp_wmem": fs.newDentry(root, fs.NextIno(), 0644, &tcpMemData{stack: stack, dir: tcpWMem}),
+ "ip_forward": fs.newDentry(root, fs.NextIno(), 0444, &ipForwarding{stack: stack}),
// The following files are simple stubs until they are implemented in
// netstack, most of these files are configuration related. We use the
@@ -174,7 +177,7 @@ type tcpSackData struct {
var _ vfs.WritableDynamicBytesSource = (*tcpSackData)(nil)
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error {
if d.enabled == nil {
sack, err := d.stack.TCPSACKEnabled()
@@ -232,7 +235,7 @@ type tcpRecoveryData struct {
var _ vfs.WritableDynamicBytesSource = (*tcpRecoveryData)(nil)
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error {
recovery, err := d.stack.TCPRecovery()
if err != nil {
@@ -284,7 +287,7 @@ type tcpMemData struct {
var _ vfs.WritableDynamicBytesSource = (*tcpMemData)(nil)
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
func (d *tcpMemData) Generate(ctx context.Context, buf *bytes.Buffer) error {
d.mu.Lock()
defer d.mu.Unlock()
@@ -354,3 +357,63 @@ func (d *tcpMemData) writeSizeLocked(size inet.TCPBufferSize) error {
panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir))
}
}
+
+// ipForwarding implements vfs.WritableDynamicBytesSource for
+// /proc/sys/net/ipv4/ip_forwarding.
+//
+// +stateify savable
+type ipForwarding struct {
+ kernfs.DynamicBytesFile
+
+ stack inet.Stack `state:"wait"`
+ enabled *bool
+}
+
+var _ vfs.WritableDynamicBytesSource = (*ipForwarding)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (ipf *ipForwarding) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ if ipf.enabled == nil {
+ enabled := ipf.stack.Forwarding(ipv4.ProtocolNumber)
+ ipf.enabled = &enabled
+ }
+
+ val := "0\n"
+ if *ipf.enabled {
+ // Technically, this is not quite compatible with Linux. Linux stores these
+ // as an integer, so if you write "2" into tcp_sack, you should get 2 back.
+ // Tough luck.
+ val = "1\n"
+ }
+ buf.WriteString(val)
+
+ return nil
+}
+
+// Write implements vfs.WritableDynamicBytesSource.Write.
+func (ipf *ipForwarding) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+ if offset != 0 {
+ // No need to handle partial writes thus far.
+ return 0, syserror.EINVAL
+ }
+ if src.NumBytes() == 0 {
+ return 0, nil
+ }
+
+ // Limit input size so as not to impact performance if input size is large.
+ src = src.TakeFirst(usermem.PageSize - 1)
+
+ var v int32
+ n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+ if err != nil {
+ return 0, err
+ }
+ if ipf.enabled == nil {
+ ipf.enabled = new(bool)
+ }
+ *ipf.enabled = v != 0
+ if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, *ipf.enabled); err != nil {
+ return 0, err
+ }
+ return n, nil
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys_test.go b/pkg/sentry/fsimpl/proc/tasks_sys_test.go
index be54897bb..6cee22823 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys_test.go
@@ -20,8 +20,10 @@ import (
"testing"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/contexttest"
"gvisor.dev/gvisor/pkg/sentry/inet"
+ "gvisor.dev/gvisor/pkg/usermem"
)
func newIPv6TestStack() *inet.TestStack {
@@ -76,3 +78,72 @@ func TestIfinet6(t *testing.T) {
t.Errorf("Got n.contents() = %v, want = %v", got, want)
}
}
+
+// TestIPForwarding tests the implementation of
+// /proc/sys/net/ipv4/ip_forwarding
+func TestConfigureIPForwarding(t *testing.T) {
+ ctx := context.Background()
+ s := inet.NewTestStack()
+
+ var cases = []struct {
+ comment string
+ initial bool
+ str string
+ final bool
+ }{
+ {
+ comment: `Forwarding is disabled; write 1 and enable forwarding`,
+ initial: false,
+ str: "1",
+ final: true,
+ },
+ {
+ comment: `Forwarding is disabled; write 0 and disable forwarding`,
+ initial: false,
+ str: "0",
+ final: false,
+ },
+ {
+ comment: `Forwarding is enabled; write 1 and enable forwarding`,
+ initial: true,
+ str: "1",
+ final: true,
+ },
+ {
+ comment: `Forwarding is enabled; write 0 and disable forwarding`,
+ initial: true,
+ str: "0",
+ final: false,
+ },
+ {
+ comment: `Forwarding is disabled; write 2404 and enable forwarding`,
+ initial: false,
+ str: "2404",
+ final: true,
+ },
+ {
+ comment: `Forwarding is enabled; write 2404 and enable forwarding`,
+ initial: true,
+ str: "2404",
+ final: true,
+ },
+ }
+ for _, c := range cases {
+ t.Run(c.comment, func(t *testing.T) {
+ s.IPForwarding = c.initial
+
+ file := &ipForwarding{stack: s, enabled: &c.initial}
+
+ // Write the values.
+ src := usermem.BytesIOSequence([]byte(c.str))
+ if n, err := file.Write(ctx, src, 0); n != int64(len(c.str)) || err != nil {
+ t.Errorf("file.Write(ctx, nil, %q, 0) = (%d, %v); want (%d, nil)", c.str, n, err, len(c.str))
+ }
+
+ // Read the values from the stack and check them.
+ if got, want := s.IPForwarding, c.final; got != want {
+ t.Errorf("s.IPForwarding incorrect; got: %v, want: %v", got, want)
+ }
+ })
+ }
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index d82b3d2f3..6975af5a7 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -67,6 +67,7 @@ var (
taskStaticFiles = map[string]testutil.DirentType{
"auxv": linux.DT_REG,
"cgroup": linux.DT_REG,
+ "cwd": linux.DT_LNK,
"cmdline": linux.DT_REG,
"comm": linux.DT_REG,
"environ": linux.DT_REG,
@@ -104,7 +105,7 @@ func setup(t *testing.T) *testutil.System {
AllowUserMount: true,
})
- mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.GetFilesystemOptions{})
+ mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.MountOptions{})
if err != nil {
t.Fatalf("NewMountNamespace(): %v", err)
}
diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go
index 6297e1df4..bf11b425a 100644
--- a/pkg/sentry/fsimpl/signalfd/signalfd.go
+++ b/pkg/sentry/fsimpl/signalfd/signalfd.go
@@ -26,7 +26,9 @@ import (
"gvisor.dev/gvisor/pkg/waiter"
)
-// SignalFileDescription implements FileDescriptionImpl for signal fds.
+// SignalFileDescription implements vfs.FileDescriptionImpl for signal fds.
+//
+// +stateify savable
type SignalFileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
@@ -43,7 +45,7 @@ type SignalFileDescription struct {
target *kernel.Task
// mu protects mask.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
// mask is the signal mask. Protected by mu.
mask linux.SignalSet
@@ -83,7 +85,7 @@ func (sfd *SignalFileDescription) SetMask(mask linux.SignalSet) {
sfd.mask = mask
}
-// Read implements FileDescriptionImpl.Read.
+// Read implements vfs.FileDescriptionImpl.Read.
func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
// Attempt to dequeue relevant signals.
info, err := sfd.target.Sigtimedwait(sfd.Mask(), 0)
@@ -132,5 +134,5 @@ func (sfd *SignalFileDescription) EventUnregister(entry *waiter.Entry) {
sfd.target.SignalUnregister(entry)
}
-// Release implements FileDescriptionImpl.Release()
+// Release implements vfs.FileDescriptionImpl.Release.
func (sfd *SignalFileDescription) Release(context.Context) {}
diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go
index 94a998568..29e5371d6 100644
--- a/pkg/sentry/fsimpl/sockfs/sockfs.go
+++ b/pkg/sentry/fsimpl/sockfs/sockfs.go
@@ -28,14 +28,16 @@ import (
)
// filesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type filesystemType struct{}
-// GetFilesystem implements FilesystemType.GetFilesystem.
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
panic("sockfs.filesystemType.GetFilesystem should never be called")
}
-// Name implements FilesystemType.Name.
+// Name implements vfs.FilesystemType.Name.
//
// Note that registering sockfs is unnecessary, except for the fact that it
// will not show up under /proc/filesystems as a result. This is a very minor
@@ -44,6 +46,7 @@ func (filesystemType) Name() string {
return "sockfs"
}
+// +stateify savable
type filesystem struct {
kernfs.Filesystem
@@ -80,6 +83,8 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
}
// inode implements kernfs.Inode.
+//
+// +stateify savable
type inode struct {
kernfs.InodeAttrs
kernfs.InodeNoopRefCount
@@ -88,7 +93,7 @@ type inode struct {
}
// Open implements kernfs.Inode.Open.
-func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
return nil, syserror.ENXIO
}
diff --git a/pkg/sentry/fsimpl/sys/kcov.go b/pkg/sentry/fsimpl/sys/kcov.go
index 73f3d3309..b75d70ae6 100644
--- a/pkg/sentry/fsimpl/sys/kcov.go
+++ b/pkg/sentry/fsimpl/sys/kcov.go
@@ -36,6 +36,8 @@ func (fs *filesystem) newKcovFile(ctx context.Context, creds *auth.Credentials)
}
// kcovInode implements kernfs.Inode.
+//
+// +stateify savable
type kcovInode struct {
kernfs.InodeAttrs
kernfs.InodeNoopRefCount
@@ -44,7 +46,7 @@ type kcovInode struct {
implStatFS
}
-func (i *kcovInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (i *kcovInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
k := kernel.KernelFromContext(ctx)
if k == nil {
panic("KernelFromContext returned nil")
@@ -54,7 +56,7 @@ func (i *kcovInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D
kcov: k.NewKcov(),
}
- if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{
+ if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{
DenyPRead: true,
DenyPWrite: true,
}); err != nil {
@@ -63,6 +65,7 @@ func (i *kcovInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D
return &fd.vfsfd, nil
}
+// +stateify savable
type kcovFD struct {
vfs.FileDescriptionDefaultImpl
vfs.NoLockFD
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index 39952d2d0..1568c581f 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -34,9 +34,13 @@ const Name = "sysfs"
const defaultSysDirMode = linux.FileMode(0755)
// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type FilesystemType struct{}
// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
type filesystem struct {
kernfs.Filesystem
@@ -117,6 +121,8 @@ func (fs *filesystem) Release(ctx context.Context) {
}
// dir implements kernfs.Inode.
+//
+// +stateify savable
type dir struct {
dirRefs
kernfs.InodeAttrs
@@ -148,8 +154,8 @@ func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.Set
}
// Open implements kernfs.Inode.Open.
-func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
+func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
SeekEnd: kernfs.SeekEndStaticEntries,
})
if err != nil {
@@ -169,6 +175,8 @@ func (d *dir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, err
}
// cpuFile implements kernfs.Inode.
+//
+// +stateify savable
type cpuFile struct {
implStatFS
kernfs.DynamicBytesFile
@@ -190,6 +198,7 @@ func (fs *filesystem) newCPUFile(creds *auth.Credentials, maxCores uint, mode li
return d
}
+// +stateify savable
type implStatFS struct{}
// StatFS implements kernfs.Inode.StatFS.
diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go
index 9fd38b295..0a0d914cc 100644
--- a/pkg/sentry/fsimpl/sys/sys_test.go
+++ b/pkg/sentry/fsimpl/sys/sys_test.go
@@ -38,7 +38,7 @@ func newTestSystem(t *testing.T) *testutil.System {
AllowUserMount: true,
})
- mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{})
+ mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.MountOptions{})
if err != nil {
t.Fatalf("Failed to create new mount namespace: %v", err)
}
diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go
index 86beaa0a8..8853c8ad2 100644
--- a/pkg/sentry/fsimpl/timerfd/timerfd.go
+++ b/pkg/sentry/fsimpl/timerfd/timerfd.go
@@ -26,8 +26,10 @@ import (
"gvisor.dev/gvisor/pkg/waiter"
)
-// TimerFileDescription implements FileDescriptionImpl for timer fds. It also
+// TimerFileDescription implements vfs.FileDescriptionImpl for timer fds. It also
// implements ktime.TimerListener.
+//
+// +stateify savable
type TimerFileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
@@ -62,7 +64,7 @@ func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, clock ktime.Clock,
return &tfd.vfsfd, nil
}
-// Read implements FileDescriptionImpl.Read.
+// Read implements vfs.FileDescriptionImpl.Read.
func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
const sizeofUint64 = 8
if dst.NumBytes() < sizeofUint64 {
@@ -128,7 +130,7 @@ func (tfd *TimerFileDescription) ResumeTimer() {
tfd.timer.Resume()
}
-// Release implements FileDescriptionImpl.Release()
+// Release implements vfs.FileDescriptionImpl.Release.
func (tfd *TimerFileDescription) Release(context.Context) {
tfd.timer.Destroy()
}
diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
index e5a4218e8..5209a17af 100644
--- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -182,7 +182,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) {
vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
if err != nil {
b.Fatalf("failed to create tmpfs root mount: %v", err)
}
@@ -376,7 +376,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
if err != nil {
b.Fatalf("failed to create tmpfs root mount: %v", err)
}
diff --git a/pkg/sentry/fsimpl/tmpfs/device_file.go b/pkg/sentry/fsimpl/tmpfs/device_file.go
index ac54d420d..9129d35b7 100644
--- a/pkg/sentry/fsimpl/tmpfs/device_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/device_file.go
@@ -22,6 +22,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/vfs"
)
+// +stateify savable
type deviceFile struct {
inode inode
kind vfs.DeviceKind
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index 070c75e68..e90669cf0 100644
--- a/pkg/sentry/fsimpl/tmpfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -25,6 +25,7 @@ import (
"gvisor.dev/gvisor/pkg/syserror"
)
+// +stateify savable
type directory struct {
// Since directories can't be hard-linked, each directory can only be
// associated with a single dentry, which we can store in the directory
@@ -44,7 +45,7 @@ type directory struct {
// (with inode == nil) that represent the iteration position of
// directoryFDs. childList is used to support directoryFD.IterDirents()
// efficiently. childList is protected by iterMu.
- iterMu sync.Mutex
+ iterMu sync.Mutex `state:"nosave"`
childList dentryList
}
@@ -86,6 +87,7 @@ func (dir *directory) mayDelete(creds *auth.Credentials, child *dentry) error {
return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&dir.inode.mode)), auth.KUID(atomic.LoadUint32(&child.inode.uid)))
}
+// +stateify savable
type directoryFD struct {
fileDescription
vfs.DirectoryFileDescriptionDefaultImpl
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index e0de04e05..e39cd305b 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -673,11 +673,11 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
fs.mu.RUnlock()
return err
}
- if err := d.inode.setStat(ctx, rp.Credentials(), &opts); err != nil {
- fs.mu.RUnlock()
+ err = d.inode.setStat(ctx, rp.Credentials(), &opts)
+ fs.mu.RUnlock()
+ if err != nil {
return err
}
- fs.mu.RUnlock()
if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
@@ -770,7 +770,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
return nil
}
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
@@ -792,59 +792,59 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
}
}
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
d, err := resolveLocked(ctx, rp)
if err != nil {
return nil, err
}
- return d.inode.listxattr(size)
+ return d.inode.listXattr(size)
}
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
d, err := resolveLocked(ctx, rp)
if err != nil {
return "", err
}
- return d.inode.getxattr(rp.Credentials(), &opts)
+ return d.inode.getXattr(rp.Credentials(), &opts)
}
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
fs.mu.RLock()
d, err := resolveLocked(ctx, rp)
if err != nil {
fs.mu.RUnlock()
return err
}
- if err := d.inode.setxattr(rp.Credentials(), &opts); err != nil {
- fs.mu.RUnlock()
+ err = d.inode.setXattr(rp.Credentials(), &opts)
+ fs.mu.RUnlock()
+ if err != nil {
return err
}
- fs.mu.RUnlock()
d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
return nil
}
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
fs.mu.RLock()
d, err := resolveLocked(ctx, rp)
if err != nil {
fs.mu.RUnlock()
return err
}
- if err := d.inode.removexattr(rp.Credentials(), name); err != nil {
- fs.mu.RUnlock()
+ err = d.inode.removeXattr(rp.Credentials(), name)
+ fs.mu.RUnlock()
+ if err != nil {
return err
}
- fs.mu.RUnlock()
d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
return nil
@@ -865,8 +865,16 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
}
if d.parent == nil {
if d.name != "" {
- // This must be an anonymous memfd file.
+ // This file must have been created by
+ // newUnlinkedRegularFileDescription(). In Linux,
+ // mm/shmem.c:__shmem_file_setup() =>
+ // fs/file_table.c:alloc_file_pseudo() sets the created
+ // dentry's dentry_operations to anon_ops, for which d_dname ==
+ // simple_dname. fs/d_path.c:simple_dname() defines the
+ // dentry's pathname to be its name, prefixed with "/" and
+ // suffixed with " (deleted)".
b.PrependComponent("/" + d.name)
+ b.AppendString(" (deleted)")
return vfs.PrependPathSyntheticError{}
}
return vfs.PrependPathAtNonMountRootError{}
diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
index 5b0471ff4..d772db9e9 100644
--- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -21,6 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
+// +stateify savable
type namedPipe struct {
inode inode
diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
index ec2701d8b..be29a2363 100644
--- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
@@ -158,7 +158,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
if err != nil {
t.Fatalf("failed to create tmpfs root mount: %v", err)
}
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index 0710b65db..a199eb33d 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -36,12 +36,18 @@ import (
)
// regularFile is a regular (=S_IFREG) tmpfs file.
+//
+// +stateify savable
type regularFile struct {
inode inode
// memFile is a platform.File used to allocate pages to this regularFile.
memFile *pgalloc.MemoryFile
+ // memoryUsageKind is the memory accounting category under which pages backing
+ // this regularFile's contents are accounted.
+ memoryUsageKind usage.MemoryKind
+
// mapsMu protects mappings.
mapsMu sync.Mutex `state:"nosave"`
@@ -62,7 +68,7 @@ type regularFile struct {
writableMappingPages uint64
// dataMu protects the fields below.
- dataMu sync.RWMutex
+ dataMu sync.RWMutex `state:"nosave"`
// data maps offsets into the file to offsets into memFile that store
// the file's data.
@@ -86,14 +92,75 @@ type regularFile struct {
func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode {
file := &regularFile{
- memFile: fs.memFile,
- seals: linux.F_SEAL_SEAL,
+ memFile: fs.memFile,
+ memoryUsageKind: usage.Tmpfs,
+ seals: linux.F_SEAL_SEAL,
}
file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode)
file.inode.nlink = 1 // from parent directory
return &file.inode
}
+// newUnlinkedRegularFileDescription creates a regular file on the tmpfs
+// filesystem represented by mount and returns an FD representing that file.
+// The new file is not reachable by path traversal from any other file.
+//
+// newUnlinkedRegularFileDescription is analogous to Linux's
+// mm/shmem.c:__shmem_file_setup().
+//
+// Preconditions: mount must be a tmpfs mount.
+func newUnlinkedRegularFileDescription(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, name string) (*regularFileFD, error) {
+ fs, ok := mount.Filesystem().Impl().(*filesystem)
+ if !ok {
+ panic("tmpfs.newUnlinkedRegularFileDescription() called with non-tmpfs mount")
+ }
+
+ inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777)
+ d := fs.newDentry(inode)
+ defer d.DecRef(ctx)
+ d.name = name
+
+ fd := &regularFileFD{}
+ fd.Init(&inode.locks)
+ flags := uint32(linux.O_RDWR)
+ if err := fd.vfsfd.Init(fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+ return nil, err
+ }
+ return fd, nil
+}
+
+// NewZeroFile creates a new regular file and file description as for
+// mmap(MAP_SHARED | MAP_ANONYMOUS). The file has the given size and is
+// initially (implicitly) filled with zeroes.
+//
+// Preconditions: mount must be a tmpfs mount.
+func NewZeroFile(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, size uint64) (*vfs.FileDescription, error) {
+ // Compare mm/shmem.c:shmem_zero_setup().
+ fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, "dev/zero")
+ if err != nil {
+ return nil, err
+ }
+ rf := fd.inode().impl.(*regularFile)
+ rf.memoryUsageKind = usage.Anonymous
+ rf.size = size
+ return &fd.vfsfd, err
+}
+
+// NewMemfd creates a new regular file and file description as for
+// memfd_create.
+//
+// Preconditions: mount must be a tmpfs mount.
+func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) {
+ fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, name)
+ if err != nil {
+ return nil, err
+ }
+ if allowSeals {
+ fd.inode().impl.(*regularFile).seals = 0
+ }
+ return &fd.vfsfd, nil
+}
+
// truncate grows or shrinks the file to the given size. It returns true if the
// file size was updated.
func (rf *regularFile) truncate(newSize uint64) (bool, error) {
@@ -226,7 +293,7 @@ func (rf *regularFile) Translate(ctx context.Context, required, optional memmap.
optional.End = pgend
}
- cerr := rf.data.Fill(ctx, required, optional, rf.memFile, usage.Tmpfs, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
+ cerr := rf.data.Fill(ctx, required, optional, rf.memFile, rf.memoryUsageKind, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
// Newly-allocated pages are zeroed, so we don't need to do anything.
return dsts.NumBytes(), nil
})
@@ -260,13 +327,14 @@ func (*regularFile) InvalidateUnsavable(context.Context) error {
return nil
}
+// +stateify savable
type regularFileFD struct {
fileDescription
// off is the file offset. off is accessed using atomic memory operations.
// offMu serializes operations that may mutate off.
off int64
- offMu sync.Mutex
+ offMu sync.Mutex `state:"nosave"`
}
// Release implements vfs.FileDescriptionImpl.Release.
@@ -575,7 +643,7 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64,
case gap.Ok():
// Allocate memory for the write.
gapMR := gap.Range().Intersect(pgMR)
- fr, err := rw.file.memFile.Allocate(gapMR.Length(), usage.Tmpfs)
+ fr, err := rw.file.memFile.Allocate(gapMR.Length(), rw.file.memoryUsageKind)
if err != nil {
retErr = err
goto exitLoop
diff --git a/pkg/sentry/fsimpl/tmpfs/socket_file.go b/pkg/sentry/fsimpl/tmpfs/socket_file.go
index 3ed650474..5699d5975 100644
--- a/pkg/sentry/fsimpl/tmpfs/socket_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/socket_file.go
@@ -21,6 +21,8 @@ import (
)
// socketFile is a socket (=S_IFSOCK) tmpfs file.
+//
+// +stateify savable
type socketFile struct {
inode inode
ep transport.BoundEndpoint
diff --git a/pkg/sentry/fsimpl/tmpfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go
index b0de5fabe..a102a2ee2 100644
--- a/pkg/sentry/fsimpl/tmpfs/symlink.go
+++ b/pkg/sentry/fsimpl/tmpfs/symlink.go
@@ -19,6 +19,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
)
+// +stateify savable
type symlink struct {
inode inode
target string // immutable
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index c4cec4130..cefec8fde 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -51,9 +51,13 @@ import (
const Name = "tmpfs"
// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type FilesystemType struct{}
// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
type filesystem struct {
vfsfs vfs.Filesystem
@@ -67,7 +71,7 @@ type filesystem struct {
devMinor uint32
// mu serializes changes to the Dentry tree.
- mu sync.RWMutex
+ mu sync.RWMutex `state:"nosave"`
nextInoMinusOne uint64 // accessed using atomic memory operations
}
@@ -78,6 +82,8 @@ func (FilesystemType) Name() string {
}
// FilesystemOpts is used to pass configuration data to tmpfs.
+//
+// +stateify savable
type FilesystemOpts struct {
// RootFileType is the FileType of the filesystem root. Valid values
// are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR.
@@ -221,6 +227,8 @@ var globalStatfs = linux.Statfs{
}
// dentry implements vfs.DentryImpl.
+//
+// +stateify savable
type dentry struct {
vfsd vfs.Dentry
@@ -300,6 +308,8 @@ func (d *dentry) Watches() *vfs.Watches {
func (d *dentry) OnZeroWatches(context.Context) {}
// inode represents a filesystem object.
+//
+// +stateify savable
type inode struct {
// fs is the owning filesystem. fs is immutable.
fs *filesystem
@@ -316,12 +326,12 @@ type inode struct {
// Inode metadata. Writing multiple fields atomically requires holding
// mu, othewise atomic operations can be used.
- mu sync.Mutex
- mode uint32 // file type and mode
- nlink uint32 // protected by filesystem.mu instead of inode.mu
- uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
- gid uint32 // auth.KGID, but ...
- ino uint64 // immutable
+ mu sync.Mutex `state:"nosave"`
+ mode uint32 // file type and mode
+ nlink uint32 // protected by filesystem.mu instead of inode.mu
+ uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
+ gid uint32 // auth.KGID, but ...
+ ino uint64 // immutable
// Linux's tmpfs has no concept of btime.
atime int64 // nanoseconds
@@ -626,74 +636,50 @@ func (i *inode) touchCMtimeLocked() {
atomic.StoreInt64(&i.ctime, now)
}
-func (i *inode) listxattr(size uint64) ([]string, error) {
- return i.xattrs.Listxattr(size)
+func (i *inode) listXattr(size uint64) ([]string, error) {
+ return i.xattrs.ListXattr(size)
}
-func (i *inode) getxattr(creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
+func (i *inode) getXattr(creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
if err := i.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
return "", err
}
- return i.xattrs.Getxattr(opts)
+ return i.xattrs.GetXattr(opts)
}
-func (i *inode) setxattr(creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
+func (i *inode) setXattr(creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
if err := i.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
return err
}
- return i.xattrs.Setxattr(opts)
+ return i.xattrs.SetXattr(opts)
}
-func (i *inode) removexattr(creds *auth.Credentials, name string) error {
+func (i *inode) removeXattr(creds *auth.Credentials, name string) error {
if err := i.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
return err
}
- return i.xattrs.Removexattr(name)
+ return i.xattrs.RemoveXattr(name)
}
func (i *inode) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
- switch {
- case ats&vfs.MayRead == vfs.MayRead:
- if err := i.checkPermissions(creds, vfs.MayRead); err != nil {
- return err
- }
- case ats&vfs.MayWrite == vfs.MayWrite:
- if err := i.checkPermissions(creds, vfs.MayWrite); err != nil {
- return err
- }
- default:
- panic(fmt.Sprintf("checkXattrPermissions called with impossible AccessTypes: %v", ats))
+ // We currently only support extended attributes in the user.* and
+ // trusted.* namespaces. See b/148380782.
+ if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) && !strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) {
+ return syserror.EOPNOTSUPP
}
-
- switch {
- case strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX):
- // The trusted.* namespace can only be accessed by privileged
- // users.
- if creds.HasCapability(linux.CAP_SYS_ADMIN) {
- return nil
- }
- if ats&vfs.MayWrite == vfs.MayWrite {
- return syserror.EPERM
- }
- return syserror.ENODATA
- case strings.HasPrefix(name, linux.XATTR_USER_PREFIX):
- // Extended attributes in the user.* namespace are only
- // supported for regular files and directories.
- filetype := linux.S_IFMT & atomic.LoadUint32(&i.mode)
- if filetype == linux.S_IFREG || filetype == linux.S_IFDIR {
- return nil
- }
- if ats&vfs.MayWrite == vfs.MayWrite {
- return syserror.EPERM
- }
- return syserror.ENODATA
-
+ mode := linux.FileMode(atomic.LoadUint32(&i.mode))
+ kuid := auth.KUID(atomic.LoadUint32(&i.uid))
+ kgid := auth.KGID(atomic.LoadUint32(&i.gid))
+ if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
+ return err
}
- return syserror.EOPNOTSUPP
+ return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
}
// fileDescription is embedded by tmpfs implementations of
// vfs.FileDescriptionImpl.
+//
+// +stateify savable
type fileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
@@ -738,20 +724,20 @@ func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
return globalStatfs, nil
}
-// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
-func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
- return fd.inode().listxattr(size)
+// ListXattr implements vfs.FileDescriptionImpl.ListXattr.
+func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
+ return fd.inode().listXattr(size)
}
-// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
-func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
- return fd.inode().getxattr(auth.CredentialsFromContext(ctx), &opts)
+// GetXattr implements vfs.FileDescriptionImpl.GetXattr.
+func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
+ return fd.inode().getXattr(auth.CredentialsFromContext(ctx), &opts)
}
-// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
-func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
+// SetXattr implements vfs.FileDescriptionImpl.SetXattr.
+func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
d := fd.dentry()
- if err := d.inode.setxattr(auth.CredentialsFromContext(ctx), &opts); err != nil {
+ if err := d.inode.setXattr(auth.CredentialsFromContext(ctx), &opts); err != nil {
return err
}
@@ -760,10 +746,10 @@ func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOption
return nil
}
-// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
-func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
+// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
+func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
d := fd.dentry()
- if err := d.inode.removexattr(auth.CredentialsFromContext(ctx), name); err != nil {
+ if err := d.inode.removeXattr(auth.CredentialsFromContext(ctx), name); err != nil {
return err
}
@@ -772,37 +758,6 @@ func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
return nil
}
-// NewMemfd creates a new tmpfs regular file and file description that can back
-// an anonymous fd created by memfd_create.
-func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) {
- fs, ok := mount.Filesystem().Impl().(*filesystem)
- if !ok {
- panic("NewMemfd() called with non-tmpfs mount")
- }
-
- // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with
- // S_IRWXUGO.
- inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777)
- rf := inode.impl.(*regularFile)
- if allowSeals {
- rf.seals = 0
- }
-
- d := fs.newDentry(inode)
- defer d.DecRef(ctx)
- d.name = name
-
- // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with
- // FMODE_READ | FMODE_WRITE.
- var fd regularFileFD
- fd.Init(&inode.locks)
- flags := uint32(linux.O_RDWR)
- if err := fd.vfsfd.Init(&fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
- return nil, err
- }
- return &fd.vfsfd, nil
-}
-
// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
index 6f3e3ae6f..99c8e3c0f 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
@@ -41,7 +41,7 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr
vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
if err != nil {
return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err)
}
diff --git a/pkg/sentry/fsimpl/verity/BUILD b/pkg/sentry/fsimpl/verity/BUILD
index 326c4ed90..0ca750281 100644
--- a/pkg/sentry/fsimpl/verity/BUILD
+++ b/pkg/sentry/fsimpl/verity/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
licenses(["notice"])
@@ -13,12 +13,35 @@ go_library(
"//pkg/abi/linux",
"//pkg/context",
"//pkg/fspath",
+ "//pkg/marshal/primitive",
"//pkg/merkletree",
+ "//pkg/sentry/arch",
"//pkg/sentry/fs/lock",
+ "//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/socket/unix/transport",
"//pkg/sentry/vfs",
"//pkg/sync",
"//pkg/syserror",
+ "//pkg/usermem",
+ ],
+)
+
+go_test(
+ name = "verity_test",
+ srcs = [
+ "verity_test.go",
+ ],
+ library = ":verity",
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/context",
+ "//pkg/fspath",
+ "//pkg/sentry/arch",
+ "//pkg/sentry/fsimpl/tmpfs",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/contexttest",
+ "//pkg/sentry/vfs",
+ "//pkg/usermem",
],
)
diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go
index 0e17dbddc..a560b0797 100644
--- a/pkg/sentry/fsimpl/verity/filesystem.go
+++ b/pkg/sentry/fsimpl/verity/filesystem.go
@@ -19,6 +19,7 @@ import (
"fmt"
"io"
"strconv"
+ "strings"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
@@ -179,23 +180,19 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de
// corresponding Merkle tree file.
// This is the offset of the root hash for child in its parent's Merkle
// tree file.
- off, err := vfsObj.GetxattrAt(ctx, fs.creds, &vfs.PathOperation{
+ off, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{
Root: child.lowerMerkleVD,
Start: child.lowerMerkleVD,
- }, &vfs.GetxattrOptions{
+ }, &vfs.GetXattrOptions{
Name: merkleOffsetInParentXattr,
- // Offset is a 32 bit integer.
- Size: sizeOfInt32,
+ Size: sizeOfStringInt32,
})
// The Merkle tree file for the child should have been created and
// contains the expected xattrs. If the file or the xattr does not
// exist, it indicates unexpected modifications to the file system.
if err == syserror.ENOENT || err == syserror.ENODATA {
- if noCrashOnVerificationFailure {
- return nil, err
- }
- panic(fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleOffsetInParentXattr, childPath, err))
+ return nil, alertIntegrityViolation(err, fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleOffsetInParentXattr, childPath, err))
}
if err != nil {
return nil, err
@@ -204,10 +201,7 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de
// unexpected modifications to the file system.
offset, err := strconv.Atoi(off)
if err != nil {
- if noCrashOnVerificationFailure {
- return nil, syserror.EINVAL
- }
- panic(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleOffsetInParentXattr, childPath, err))
+ return nil, alertIntegrityViolation(err, fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleOffsetInParentXattr, childPath, err))
}
// Open parent Merkle tree file to read and verify child's root hash.
@@ -221,10 +215,7 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de
// The parent Merkle tree file should have been created. If it's
// missing, it indicates an unexpected modification to the file system.
if err == syserror.ENOENT {
- if noCrashOnVerificationFailure {
- return nil, err
- }
- panic(fmt.Sprintf("Failed to open parent Merkle file for %s: %v", childPath, err))
+ return nil, alertIntegrityViolation(err, fmt.Sprintf("Failed to open parent Merkle file for %s: %v", childPath, err))
}
if err != nil {
return nil, err
@@ -233,19 +224,16 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de
// dataSize is the size of raw data for the Merkle tree. For a file,
// dataSize is the size of the whole file. For a directory, dataSize is
// the size of all its children's root hashes.
- dataSize, err := parentMerkleFD.Getxattr(ctx, &vfs.GetxattrOptions{
+ dataSize, err := parentMerkleFD.GetXattr(ctx, &vfs.GetXattrOptions{
Name: merkleSizeXattr,
- Size: sizeOfInt32,
+ Size: sizeOfStringInt32,
})
// The Merkle tree file for the child should have been created and
// contains the expected xattrs. If the file or the xattr does not
// exist, it indicates unexpected modifications to the file system.
if err == syserror.ENOENT || err == syserror.ENODATA {
- if noCrashOnVerificationFailure {
- return nil, err
- }
- panic(fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleSizeXattr, childPath, err))
+ return nil, alertIntegrityViolation(err, fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleSizeXattr, childPath, err))
}
if err != nil {
return nil, err
@@ -255,10 +243,7 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de
// unexpected modifications to the file system.
parentSize, err := strconv.Atoi(dataSize)
if err != nil {
- if noCrashOnVerificationFailure {
- return nil, syserror.EINVAL
- }
- panic(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleSizeXattr, childPath, err))
+ return nil, alertIntegrityViolation(syserror.EINVAL, fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleSizeXattr, childPath, err))
}
fdReader := vfs.FileReadWriteSeeker{
@@ -270,11 +255,8 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de
// contain the root hash of the children in the parent Merkle tree when
// Verify returns with success.
var buf bytes.Buffer
- if err := merkletree.Verify(&buf, &fdReader, &fdReader, int64(parentSize), int64(offset), int64(merkletree.DigestSize()), parent.rootHash, true /* dataAndTreeInSameFile */); err != nil && err != io.EOF {
- if noCrashOnVerificationFailure {
- return nil, syserror.EIO
- }
- panic(fmt.Sprintf("Verification for %s failed: %v", childPath, err))
+ if _, err := merkletree.Verify(&buf, &fdReader, &fdReader, int64(parentSize), int64(offset), int64(merkletree.DigestSize()), parent.rootHash, true /* dataAndTreeInSameFile */); err != nil && err != io.EOF {
+ return nil, alertIntegrityViolation(syserror.EIO, fmt.Sprintf("Verification for %s failed: %v", childPath, err))
}
// Cache child root hash when it's verified the first time.
@@ -370,10 +352,7 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry,
// corresponding Merkle tree is found. This indicates an
// unexpected modification to the file system that
// removed/renamed the child.
- if noCrashOnVerificationFailure {
- return nil, childErr
- }
- panic(fmt.Sprintf("Target file %s is expected but missing", parentPath+"/"+name))
+ return nil, alertIntegrityViolation(childErr, fmt.Sprintf("Target file %s is expected but missing", parentPath+"/"+name))
} else if childErr == nil && childMerkleErr == syserror.ENOENT {
// If in allowRuntimeEnable mode, and the Merkle tree file is
// not created yet, we create an empty Merkle tree file, so that
@@ -392,6 +371,7 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry,
Path: fspath.Parse(childMerkleFilename),
}, &vfs.OpenOptions{
Flags: linux.O_RDWR | linux.O_CREAT,
+ Mode: 0644,
})
if err != nil {
return nil, err
@@ -409,11 +389,16 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry,
// If runtime enable is not allowed. This indicates an
// unexpected modification to the file system that
// removed/renamed the Merkle tree file.
- if noCrashOnVerificationFailure {
- return nil, childMerkleErr
- }
- panic(fmt.Sprintf("Expected Merkle file for target %s but none found", parentPath+"/"+name))
+ return nil, alertIntegrityViolation(childMerkleErr, fmt.Sprintf("Expected Merkle file for target %s but none found", parentPath+"/"+name))
}
+ } else if childErr == syserror.ENOENT && childMerkleErr == syserror.ENOENT {
+ // Both the child and the corresponding Merkle tree are missing.
+ // This could be an unexpected modification or due to incorrect
+ // parameter.
+ // TODO(b/167752508): Investigate possible ways to differentiate
+ // cases that both files are deleted from cases that they never
+ // exist in the file system.
+ return nil, alertIntegrityViolation(childErr, fmt.Sprintf("Failed to find file %s", parentPath+"/"+name))
}
mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID)
@@ -572,8 +557,183 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
// OpenAt implements vfs.FilesystemImpl.OpenAt.
func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- //TODO(b/159261227): Implement OpenAt.
- return nil, nil
+ // Verity fs is read-only.
+ if opts.Flags&(linux.O_WRONLY|linux.O_CREAT) != 0 {
+ return nil, syserror.EROFS
+ }
+
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+
+ start := rp.Start().Impl().(*dentry)
+ if rp.Done() {
+ return start.openLocked(ctx, rp, &opts)
+ }
+
+afterTrailingSymlink:
+ parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+ if err != nil {
+ return nil, err
+ }
+
+ // Check for search permission in the parent directory.
+ if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, err
+ }
+
+ // Open existing child or follow symlink.
+ parent.dirMu.Lock()
+ child, err := fs.stepLocked(ctx, rp, parent, false /*mayFollowSymlinks*/, &ds)
+ parent.dirMu.Unlock()
+ if err != nil {
+ return nil, err
+ }
+ if child.isSymlink() && rp.ShouldFollowSymlink() {
+ target, err := child.readlink(ctx)
+ if err != nil {
+ return nil, err
+ }
+ if err := rp.HandleSymlink(target); err != nil {
+ return nil, err
+ }
+ start = parent
+ goto afterTrailingSymlink
+ }
+ return child.openLocked(ctx, rp, &opts)
+}
+
+// Preconditions: fs.renameMu must be locked.
+func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+ // Users should not open the Merkle tree files. Those are for verity fs
+ // use only.
+ if strings.Contains(d.name, merklePrefix) {
+ return nil, syserror.EPERM
+ }
+ ats := vfs.AccessTypesForOpenFlags(opts)
+ if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
+ return nil, err
+ }
+
+ // Verity fs is read-only.
+ if ats&vfs.MayWrite != 0 {
+ return nil, syserror.EROFS
+ }
+
+ // Get the path to the target file. This is only used to provide path
+ // information in failure case.
+ path, err := d.fs.vfsfs.VirtualFilesystem().PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.lowerVD)
+ if err != nil {
+ return nil, err
+ }
+
+ // Open the file in the underlying file system.
+ lowerFD, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.lowerVD,
+ Start: d.lowerVD,
+ }, opts)
+
+ // The file should exist, as we succeeded in finding its dentry. If it's
+ // missing, it indicates an unexpected modification to the file system.
+ if err != nil {
+ if err == syserror.ENOENT {
+ return nil, alertIntegrityViolation(err, fmt.Sprintf("File %s expected but not found", path))
+ }
+ return nil, err
+ }
+
+ // lowerFD needs to be cleaned up if any error occurs. IncRef will be
+ // called if a verity FD is successfully created.
+ defer lowerFD.DecRef(ctx)
+
+ // Open the Merkle tree file corresponding to the current file/directory
+ // to be used later for verifying Read/Walk.
+ merkleReader, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.lowerMerkleVD,
+ Start: d.lowerMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ })
+
+ // The Merkle tree file should exist, as we succeeded in finding its
+ // dentry. If it's missing, it indicates an unexpected modification to
+ // the file system.
+ if err != nil {
+ if err == syserror.ENOENT {
+ return nil, alertIntegrityViolation(err, fmt.Sprintf("Merkle file for %s expected but not found", path))
+ }
+ return nil, err
+ }
+
+ // merkleReader needs to be cleaned up if any error occurs. IncRef will
+ // be called if a verity FD is successfully created.
+ defer merkleReader.DecRef(ctx)
+
+ lowerFlags := lowerFD.StatusFlags()
+ lowerFDOpts := lowerFD.Options()
+ var merkleWriter *vfs.FileDescription
+ var parentMerkleWriter *vfs.FileDescription
+
+ // Only open the Merkle tree files for write if in allowRuntimeEnable
+ // mode.
+ if d.fs.allowRuntimeEnable {
+ merkleWriter, err = rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.lowerMerkleVD,
+ Start: d.lowerMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_WRONLY | linux.O_APPEND,
+ })
+ if err != nil {
+ if err == syserror.ENOENT {
+ return nil, alertIntegrityViolation(err, fmt.Sprintf("Merkle file for %s expected but not found", path))
+ }
+ return nil, err
+ }
+ // merkleWriter is cleaned up if any error occurs. IncRef will
+ // be called if a verity FD is created successfully.
+ defer merkleWriter.DecRef(ctx)
+
+ if d.parent != nil {
+ parentMerkleWriter, err = rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.parent.lowerMerkleVD,
+ Start: d.parent.lowerMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_WRONLY | linux.O_APPEND,
+ })
+ if err != nil {
+ if err == syserror.ENOENT {
+ parentPath, _ := d.fs.vfsfs.VirtualFilesystem().PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.parent.lowerVD)
+ return nil, alertIntegrityViolation(err, fmt.Sprintf("Merkle file for %s expected but not found", parentPath))
+ }
+ return nil, err
+ }
+ // parentMerkleWriter is cleaned up if any error occurs. IncRef
+ // will be called if a verity FD is created successfully.
+ defer parentMerkleWriter.DecRef(ctx)
+ }
+ }
+
+ fd := &fileDescription{
+ d: d,
+ lowerFD: lowerFD,
+ merkleReader: merkleReader,
+ merkleWriter: merkleWriter,
+ parentMerkleWriter: parentMerkleWriter,
+ isDir: d.isDir(),
+ }
+
+ if err := fd.vfsfd.Init(fd, lowerFlags, rp.Mount(), &d.vfsd, &lowerFDOpts); err != nil {
+ return nil, err
+ }
+ lowerFD.IncRef()
+ merkleReader.IncRef()
+ if merkleWriter != nil {
+ merkleWriter.IncRef()
+ }
+ if parentMerkleWriter != nil {
+ parentMerkleWriter.IncRef()
+ }
+ return &fd.vfsfd, err
}
// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
@@ -649,7 +809,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
return syserror.EROFS
}
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
var ds *[]*dentry
fs.renameMu.RLock()
@@ -660,8 +820,8 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
return nil, syserror.ECONNREFUSED
}
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
@@ -670,14 +830,14 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
return nil, err
}
lowerVD := d.lowerVD
- return fs.vfsfs.VirtualFilesystem().ListxattrAt(ctx, d.fs.creds, &vfs.PathOperation{
+ return fs.vfsfs.VirtualFilesystem().ListXattrAt(ctx, d.fs.creds, &vfs.PathOperation{
Root: lowerVD,
Start: lowerVD,
}, size)
}
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
@@ -686,20 +846,20 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
return "", err
}
lowerVD := d.lowerVD
- return fs.vfsfs.VirtualFilesystem().GetxattrAt(ctx, d.fs.creds, &vfs.PathOperation{
+ return fs.vfsfs.VirtualFilesystem().GetXattrAt(ctx, d.fs.creds, &vfs.PathOperation{
Root: lowerVD,
Start: lowerVD,
}, &opts)
}
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
// Verity file system is read-only.
return syserror.EROFS
}
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
// Verity file system is read-only.
return syserror.EROFS
}
diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go
index eedb5f484..fc5eabbca 100644
--- a/pkg/sentry/fsimpl/verity/verity.go
+++ b/pkg/sentry/fsimpl/verity/verity.go
@@ -22,16 +22,23 @@
package verity
import (
+ "fmt"
+ "strconv"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
+ "gvisor.dev/gvisor/pkg/merkletree"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// Name is the default filesystem name.
@@ -50,8 +57,9 @@ const merkleOffsetInParentXattr = "user.merkle.offset"
// whole file. For a directory, it's the size of all its children's root hashes.
const merkleSizeXattr = "user.merkle.size"
-// sizeOfInt32 is the size in bytes for a 32 bit integer in extended attributes.
-const sizeOfInt32 = 4
+// sizeOfStringInt32 is the size for a 32 bit integer stored as string in
+// extended attributes. The maximum value of a 32 bit integer is 10 digits.
+const sizeOfStringInt32 = 10
// noCrashOnVerificationFailure indicates whether the sandbox should panic
// whenever verification fails. If true, an error is returned instead of
@@ -66,9 +74,13 @@ var noCrashOnVerificationFailure bool
var verityMu sync.RWMutex
// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type FilesystemType struct{}
// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
type filesystem struct {
vfsfs vfs.Filesystem
@@ -93,11 +105,13 @@ type filesystem struct {
// renameMu synchronizes renaming with non-renaming operations in order
// to ensure consistent lock ordering between dentry.dirMu in different
// dentries.
- renameMu sync.RWMutex
+ renameMu sync.RWMutex `state:"nosave"`
}
// InternalFilesystemOptions may be passed as
// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
+//
+// +stateify savable
type InternalFilesystemOptions struct {
// RootMerkleFileName is the name of the verity root Merkle tree file.
RootMerkleFileName string
@@ -128,6 +142,16 @@ func (FilesystemType) Name() string {
return Name
}
+// alertIntegrityViolation alerts a violation of integrity, which usually means
+// unexpected modification to the file system is detected. In
+// noCrashOnVerificationFailure mode, it returns an error, otherwise it panic.
+func alertIntegrityViolation(err error, msg string) error {
+ if noCrashOnVerificationFailure {
+ return err
+ }
+ panic(msg)
+}
+
// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
iopts, ok := opts.InternalData.(InternalFilesystemOptions)
@@ -141,6 +165,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
// verity, and should not be exposed or connected.
mopts := &vfs.MountOptions{
GetFilesystemOptions: iopts.LowerGetFSOptions,
+ InternalMount: true,
}
mnt, err := vfsObj.MountDisconnected(ctx, creds, "", iopts.LowerName, mopts)
if err != nil {
@@ -197,15 +222,12 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
return nil, nil, err
}
} else if err != nil {
- // Failed to get dentry for the root Merkle file. This indicates
- // an attack that removed/renamed the root Merkle file, or it's
- // never generated.
- if noCrashOnVerificationFailure {
- fs.vfsfs.DecRef(ctx)
- d.DecRef(ctx)
- return nil, nil, err
- }
- panic("Failed to find root Merkle file")
+ // Failed to get dentry for the root Merkle file. This
+ // indicates an unexpected modification that removed/renamed
+ // the root Merkle file, or it's never generated.
+ fs.vfsfs.DecRef(ctx)
+ d.DecRef(ctx)
+ return nil, nil, alertIntegrityViolation(err, "Failed to find root Merkle file")
}
d.lowerMerkleVD = lowerMerkleVD
@@ -243,6 +265,8 @@ func (fs *filesystem) Release(ctx context.Context) {
}
// dentry implements vfs.DentryImpl.
+//
+// +stateify savable
type dentry struct {
vfsd vfs.Dentry
@@ -269,7 +293,7 @@ type dentry struct {
// and dirents (if not nil) is a cache of dirents as returned by
// directoryFDs representing this directory. children is protected by
// dirMu.
- dirMu sync.Mutex
+ dirMu sync.Mutex `state:"nosave"`
children map[string]*dentry
// lowerVD is the VirtualDentry in the underlying file system.
@@ -413,6 +437,8 @@ func (d *dentry) readlink(ctx context.Context) (string, error) {
// FileDescription is a wrapper of the underlying lowerFD, with support to build
// Merkle trees through the Linux fs-verity API to verify contents read from
// lowerFD.
+//
+// +stateify savable
type fileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
@@ -471,12 +497,247 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
return syserror.EPERM
}
+// generateMerkle generates a Merkle tree file for fd. If fd points to a file
+// /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The root
+// hash of the generated Merkle tree and the data size is returned.
+// If fd points to a regular file, the data is the content of the file. If fd
+// points to a directory, the data is all root hahes of its children, written
+// to the Merkle tree file.
+func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64, error) {
+ fdReader := vfs.FileReadWriteSeeker{
+ FD: fd.lowerFD,
+ Ctx: ctx,
+ }
+ merkleReader := vfs.FileReadWriteSeeker{
+ FD: fd.merkleReader,
+ Ctx: ctx,
+ }
+ merkleWriter := vfs.FileReadWriteSeeker{
+ FD: fd.merkleWriter,
+ Ctx: ctx,
+ }
+ var rootHash []byte
+ var dataSize uint64
+
+ switch atomic.LoadUint32(&fd.d.mode) & linux.S_IFMT {
+ case linux.S_IFREG:
+ // For a regular file, generate a Merkle tree based on its
+ // content.
+ var err error
+ stat, err := fd.lowerFD.Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ return nil, 0, err
+ }
+ dataSize = stat.Size
+
+ rootHash, err = merkletree.Generate(&fdReader, int64(dataSize), &merkleReader, &merkleWriter, false /* dataAndTreeInSameFile */)
+ if err != nil {
+ return nil, 0, err
+ }
+ case linux.S_IFDIR:
+ // For a directory, generate a Merkle tree based on the root
+ // hashes of its children that has already been written to the
+ // Merkle tree file.
+ merkleStat, err := fd.merkleReader.Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ return nil, 0, err
+ }
+ dataSize = merkleStat.Size
+
+ rootHash, err = merkletree.Generate(&merkleReader, int64(dataSize), &merkleReader, &merkleWriter, true /* dataAndTreeInSameFile */)
+ if err != nil {
+ return nil, 0, err
+ }
+ default:
+ // TODO(b/167728857): Investigate whether and how we should
+ // enable other types of file.
+ return nil, 0, syserror.EINVAL
+ }
+ return rootHash, dataSize, nil
+}
+
+// enableVerity enables verity features on fd by generating a Merkle tree file
+// and stores its root hash in its parent directory's Merkle tree.
+func (fd *fileDescription) enableVerity(ctx context.Context, uio usermem.IO) (uintptr, error) {
+ if !fd.d.fs.allowRuntimeEnable {
+ return 0, syserror.EPERM
+ }
+
+ // Lock to prevent other threads performing enable or access the file
+ // while it's being enabled.
+ verityMu.Lock()
+ defer verityMu.Unlock()
+
+ // In allowRuntimeEnable mode, the underlying fd and read/write fd for
+ // the Merkle tree file should have all been initialized. For any file
+ // or directory other than the root, the parent Merkle tree file should
+ // have also been initialized.
+ if fd.lowerFD == nil || fd.merkleReader == nil || fd.merkleWriter == nil || (fd.parentMerkleWriter == nil && fd.d != fd.d.fs.rootDentry) {
+ return 0, alertIntegrityViolation(syserror.EIO, "Unexpected verity fd: missing expected underlying fds")
+ }
+
+ rootHash, dataSize, err := fd.generateMerkle(ctx)
+ if err != nil {
+ return 0, err
+ }
+
+ if fd.parentMerkleWriter != nil {
+ stat, err := fd.parentMerkleWriter.Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ return 0, err
+ }
+
+ // Write the root hash of fd to the parent directory's Merkle
+ // tree file, as it should be part of the parent Merkle tree
+ // data. parentMerkleWriter is open with O_APPEND, so it
+ // should write directly to the end of the file.
+ if _, err = fd.parentMerkleWriter.Write(ctx, usermem.BytesIOSequence(rootHash), vfs.WriteOptions{}); err != nil {
+ return 0, err
+ }
+
+ // Record the offset of the root hash of fd in parent directory's
+ // Merkle tree file.
+ if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{
+ Name: merkleOffsetInParentXattr,
+ Value: strconv.Itoa(int(stat.Size)),
+ }); err != nil {
+ return 0, err
+ }
+ }
+
+ // Record the size of the data being hashed for fd.
+ if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{
+ Name: merkleSizeXattr,
+ Value: strconv.Itoa(int(dataSize)),
+ }); err != nil {
+ return 0, err
+ }
+ fd.d.rootHash = append(fd.d.rootHash, rootHash...)
+ return 0, nil
+}
+
+// measureVerity returns the root hash of fd, saved in args[2].
+func (fd *fileDescription) measureVerity(ctx context.Context, uio usermem.IO, verityDigest usermem.Addr) (uintptr, error) {
+ t := kernel.TaskFromContext(ctx)
+ var metadata linux.DigestMetadata
+
+ // If allowRuntimeEnable is true, an empty fd.d.rootHash indicates that
+ // verity is not enabled for the file. If allowRuntimeEnable is false,
+ // this is an integrity violation because all files should have verity
+ // enabled, in which case fd.d.rootHash should be set.
+ if len(fd.d.rootHash) == 0 {
+ if fd.d.fs.allowRuntimeEnable {
+ return 0, syserror.ENODATA
+ }
+ return 0, alertIntegrityViolation(syserror.ENODATA, "Ioctl measureVerity: no root hash found")
+ }
+
+ // The first part of VerityDigest is the metadata.
+ if _, err := metadata.CopyIn(t, verityDigest); err != nil {
+ return 0, err
+ }
+ if metadata.DigestSize < uint16(len(fd.d.rootHash)) {
+ return 0, syserror.EOVERFLOW
+ }
+
+ // Populate the output digest size, since DigestSize is both input and
+ // output.
+ metadata.DigestSize = uint16(len(fd.d.rootHash))
+
+ // First copy the metadata.
+ if _, err := metadata.CopyOut(t, verityDigest); err != nil {
+ return 0, err
+ }
+
+ // Now copy the root hash bytes to the memory after metadata.
+ _, err := t.CopyOutBytes(usermem.Addr(uintptr(verityDigest)+linux.SizeOfDigestMetadata), fd.d.rootHash)
+ return 0, err
+}
+
+func (fd *fileDescription) verityFlags(ctx context.Context, uio usermem.IO, flags usermem.Addr) (uintptr, error) {
+ f := int32(0)
+
+ // All enabled files should store a root hash. This flag is not settable
+ // via FS_IOC_SETFLAGS.
+ if len(fd.d.rootHash) != 0 {
+ f |= linux.FS_VERITY_FL
+ }
+
+ t := kernel.TaskFromContext(ctx)
+ _, err := primitive.CopyInt32Out(t, flags, f)
+ return 0, err
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ switch cmd := args[1].Uint(); cmd {
+ case linux.FS_IOC_ENABLE_VERITY:
+ return fd.enableVerity(ctx, uio)
+ case linux.FS_IOC_MEASURE_VERITY:
+ return fd.measureVerity(ctx, uio, args[2].Pointer())
+ case linux.FS_IOC_GETFLAGS:
+ return fd.verityFlags(ctx, uio, args[2].Pointer())
+ default:
+ // TODO(b/169682228): Investigate which ioctl commands should
+ // be allowed.
+ return 0, syserror.ENOSYS
+ }
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ // No need to verify if the file is not enabled yet in
+ // allowRuntimeEnable mode.
+ if fd.d.fs.allowRuntimeEnable && len(fd.d.rootHash) == 0 {
+ return fd.lowerFD.PRead(ctx, dst, offset, opts)
+ }
+
+ // dataSize is the size of the whole file.
+ dataSize, err := fd.merkleReader.GetXattr(ctx, &vfs.GetXattrOptions{
+ Name: merkleSizeXattr,
+ Size: sizeOfStringInt32,
+ })
+
+ // The Merkle tree file for the child should have been created and
+ // contains the expected xattrs. If the xattr does not exist, it
+ // indicates unexpected modifications to the file system.
+ if err == syserror.ENODATA {
+ return 0, alertIntegrityViolation(err, fmt.Sprintf("Failed to get xattr %s: %v", merkleSizeXattr, err))
+ }
+ if err != nil {
+ return 0, err
+ }
+
+ // The dataSize xattr should be an integer. If it's not, it indicates
+ // unexpected modifications to the file system.
+ size, err := strconv.Atoi(dataSize)
+ if err != nil {
+ return 0, alertIntegrityViolation(err, fmt.Sprintf("Failed to convert xattr %s to int: %v", merkleSizeXattr, err))
+ }
+
+ dataReader := vfs.FileReadWriteSeeker{
+ FD: fd.lowerFD,
+ Ctx: ctx,
+ }
+
+ merkleReader := vfs.FileReadWriteSeeker{
+ FD: fd.merkleReader,
+ Ctx: ctx,
+ }
+
+ n, err := merkletree.Verify(dst.Writer(ctx), &dataReader, &merkleReader, int64(size), offset, dst.NumBytes(), fd.d.rootHash, false /* dataAndTreeInSameFile */)
+ if err != nil {
+ return 0, alertIntegrityViolation(syserror.EINVAL, fmt.Sprintf("Verification failed: %v", err))
+ }
+ return n, err
+}
+
// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
- return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+ return fd.lowerFD.LockPOSIX(ctx, uid, t, start, length, whence, block)
}
// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
- return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+ return fd.lowerFD.UnlockPOSIX(ctx, uid, start, length, whence)
}
diff --git a/pkg/sentry/fsimpl/verity/verity_test.go b/pkg/sentry/fsimpl/verity/verity_test.go
new file mode 100644
index 000000000..9b5641092
--- /dev/null
+++ b/pkg/sentry/fsimpl/verity/verity_test.go
@@ -0,0 +1,349 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package verity
+
+import (
+ "fmt"
+ "io"
+ "math/rand"
+ "testing"
+ "time"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// rootMerkleFilename is the name of the root Merkle tree file.
+const rootMerkleFilename = "root.verity"
+
+// maxDataSize is the maximum data size written to the file for test.
+const maxDataSize = 100000
+
+// newVerityRoot creates a new verity mount, and returns the root. The
+// underlying file system is tmpfs. If the error is not nil, then cleanup
+// should be called when the root is no longer needed.
+func newVerityRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentry, func(), error) {
+ rand.Seed(time.Now().UnixNano())
+ vfsObj := &vfs.VirtualFilesystem{}
+ if err := vfsObj.Init(ctx); err != nil {
+ return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err)
+ }
+
+ vfsObj.MustRegisterFilesystemType("verity", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ })
+
+ vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ })
+
+ mntns, err := vfsObj.NewMountNamespace(ctx, auth.CredentialsFromContext(ctx), "", "verity", &vfs.MountOptions{
+ GetFilesystemOptions: vfs.GetFilesystemOptions{
+ InternalData: InternalFilesystemOptions{
+ RootMerkleFileName: rootMerkleFilename,
+ LowerName: "tmpfs",
+ AllowRuntimeEnable: true,
+ NoCrashOnVerificationFailure: true,
+ },
+ },
+ })
+ if err != nil {
+ return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create verity root mount: %v", err)
+ }
+ root := mntns.Root()
+ return vfsObj, root, func() {
+ root.DecRef(ctx)
+ mntns.DecRef(ctx)
+ }, nil
+}
+
+// newFileFD creates a new file in the verity mount, and returns the FD. The FD
+// points to a file that has random data generated.
+func newFileFD(ctx context.Context, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, filePath string, mode linux.FileMode) (*vfs.FileDescription, int, error) {
+ creds := auth.CredentialsFromContext(ctx)
+ lowerRoot := root.Dentry().Impl().(*dentry).lowerVD
+
+ // Create the file in the underlying file system.
+ lowerFD, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+ Root: lowerRoot,
+ Start: lowerRoot,
+ Path: fspath.Parse(filePath),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+ Mode: linux.ModeRegular | mode,
+ })
+ if err != nil {
+ return nil, 0, err
+ }
+
+ // Generate random data to be written to the file.
+ dataSize := rand.Intn(maxDataSize) + 1
+ data := make([]byte, dataSize)
+ rand.Read(data)
+
+ // Write directly to the underlying FD, since verity FD is read-only.
+ n, err := lowerFD.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+ if err != nil {
+ return nil, 0, err
+ }
+
+ if n != int64(len(data)) {
+ return nil, 0, fmt.Errorf("lowerFD.Write got write length %d, want %d", n, len(data))
+ }
+
+ lowerFD.DecRef(ctx)
+
+ // Now open the verity file descriptor.
+ fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(filePath),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ Mode: linux.ModeRegular | mode,
+ })
+ return fd, dataSize, err
+}
+
+// corruptRandomBit randomly flips a bit in the file represented by fd.
+func corruptRandomBit(ctx context.Context, fd *vfs.FileDescription, size int) error {
+ // Flip a random bit in the underlying file.
+ randomPos := int64(rand.Intn(size))
+ byteToModify := make([]byte, 1)
+ if _, err := fd.PRead(ctx, usermem.BytesIOSequence(byteToModify), randomPos, vfs.ReadOptions{}); err != nil {
+ return fmt.Errorf("lowerFD.PRead failed: %v", err)
+ }
+ byteToModify[0] ^= 1
+ if _, err := fd.PWrite(ctx, usermem.BytesIOSequence(byteToModify), randomPos, vfs.WriteOptions{}); err != nil {
+ return fmt.Errorf("lowerFD.PWrite failed: %v", err)
+ }
+ return nil
+}
+
+// TestOpen ensures that when a file is created, the corresponding Merkle tree
+// file and the root Merkle tree file exist.
+func TestOpen(t *testing.T) {
+ ctx := contexttest.Context(t)
+ vfsObj, root, cleanup, err := newVerityRoot(ctx)
+ if err != nil {
+ t.Fatalf("Failed to create new verity root: %v", err)
+ }
+ defer cleanup()
+
+ filename := "verity-test-file"
+ if _, _, err := newFileFD(ctx, vfsObj, root, filename, 0644); err != nil {
+ t.Fatalf("Failed to create new file fd: %v", err)
+ }
+
+ // Ensure that the corresponding Merkle tree file is created.
+ lowerRoot := root.Dentry().Impl().(*dentry).lowerVD
+ if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: lowerRoot,
+ Start: lowerRoot,
+ Path: fspath.Parse(merklePrefix + filename),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ }); err != nil {
+ t.Errorf("Failed to open Merkle tree file %s: %v", merklePrefix+filename, err)
+ }
+
+ // Ensure the root merkle tree file is created.
+ if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: lowerRoot,
+ Start: lowerRoot,
+ Path: fspath.Parse(merklePrefix + rootMerkleFilename),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ }); err != nil {
+ t.Errorf("Failed to open root Merkle tree file %s: %v", merklePrefix+rootMerkleFilename, err)
+ }
+}
+
+// TestUntouchedFileSucceeds ensures that read from an untouched verity file
+// succeeds after enabling verity for it.
+func TestReadUntouchedFileSucceeds(t *testing.T) {
+ ctx := contexttest.Context(t)
+ vfsObj, root, cleanup, err := newVerityRoot(ctx)
+ if err != nil {
+ t.Fatalf("Failed to create new verity root: %v", err)
+ }
+ defer cleanup()
+
+ filename := "verity-test-file"
+ fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+ if err != nil {
+ t.Fatalf("Failed to create new file fd: %v", err)
+ }
+
+ // Enable verity on the file and confirm a normal read succeeds.
+ var args arch.SyscallArguments
+ args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+ if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("Ioctl failed: %v", err)
+ }
+
+ buf := make([]byte, size)
+ n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{})
+ if err != nil && err != io.EOF {
+ t.Fatalf("fd.PRead failed: %v", err)
+ }
+
+ if n != int64(size) {
+ t.Errorf("fd.PRead got read length %d, want %d", n, size)
+ }
+}
+
+// TestReopenUntouchedFileSucceeds ensures that reopen an untouched verity file
+// succeeds after enabling verity for it.
+func TestReopenUntouchedFileSucceeds(t *testing.T) {
+ ctx := contexttest.Context(t)
+ vfsObj, root, cleanup, err := newVerityRoot(ctx)
+ if err != nil {
+ t.Fatalf("Failed to create new verity root: %v", err)
+ }
+ defer cleanup()
+
+ filename := "verity-test-file"
+ fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+ if err != nil {
+ t.Fatalf("Failed to create new file fd: %v", err)
+ }
+
+ // Enable verity on the file and confirms a normal read succeeds.
+ var args arch.SyscallArguments
+ args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+ if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("Ioctl failed: %v", err)
+ }
+
+ // Ensure reopening the verity enabled file succeeds.
+ if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(filename),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ Mode: linux.ModeRegular,
+ }); err != nil {
+ t.Errorf("reopen enabled file failed: %v", err)
+ }
+}
+
+// TestModifiedFileFails ensures that read from a modified verity file fails.
+func TestModifiedFileFails(t *testing.T) {
+ ctx := contexttest.Context(t)
+ vfsObj, root, cleanup, err := newVerityRoot(ctx)
+ if err != nil {
+ t.Fatalf("Failed to create new verity root: %v", err)
+ }
+ defer cleanup()
+
+ filename := "verity-test-file"
+ fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+ if err != nil {
+ t.Fatalf("Failed to create new file fd: %v", err)
+ }
+
+ // Enable verity on the file.
+ var args arch.SyscallArguments
+ args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+ if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("Ioctl failed: %v", err)
+ }
+
+ // Open a new lowerFD that's read/writable.
+ lowerVD := fd.Impl().(*fileDescription).d.lowerVD
+
+ lowerFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: lowerVD,
+ Start: lowerVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR,
+ })
+ if err != nil {
+ t.Fatalf("Open lowerFD failed: %v", err)
+ }
+
+ if err := corruptRandomBit(ctx, lowerFD, size); err != nil {
+ t.Fatalf("%v", err)
+ }
+
+ // Confirm that read from the modified file fails.
+ buf := make([]byte, size)
+ if _, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}); err == nil {
+ t.Fatalf("fd.PRead succeeded with modified file")
+ }
+}
+
+// TestModifiedMerkleFails ensures that read from a verity file fails if the
+// corresponding Merkle tree file is modified.
+func TestModifiedMerkleFails(t *testing.T) {
+ ctx := contexttest.Context(t)
+ vfsObj, root, cleanup, err := newVerityRoot(ctx)
+ if err != nil {
+ t.Fatalf("Failed to create new verity root: %v", err)
+ }
+ defer cleanup()
+
+ filename := "verity-test-file"
+ fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+ if err != nil {
+ t.Fatalf("Failed to create new file fd: %v", err)
+ }
+
+ // Enable verity on the file.
+ var args arch.SyscallArguments
+ args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+ if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("Ioctl failed: %v", err)
+ }
+
+ // Open a new lowerMerkleFD that's read/writable.
+ lowerMerkleVD := fd.Impl().(*fileDescription).d.lowerMerkleVD
+
+ lowerMerkleFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: lowerMerkleVD,
+ Start: lowerMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR,
+ })
+ if err != nil {
+ t.Fatalf("Open lowerMerkleFD failed: %v", err)
+ }
+
+ // Flip a random bit in the Merkle tree file.
+ stat, err := lowerMerkleFD.Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ t.Fatalf("Failed to get lowerMerkleFD stat: %v", err)
+ }
+ merkleSize := int(stat.Size)
+ if err := corruptRandomBit(ctx, lowerMerkleFD, merkleSize); err != nil {
+ t.Fatalf("%v", err)
+ }
+
+ // Confirm that read from a file with modified Merkle tree fails.
+ buf := make([]byte, size)
+ if _, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}); err == nil {
+ fmt.Println(buf)
+ t.Fatalf("fd.PRead succeeded with modified Merkle file")
+ }
+}
diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD
index 07bf39fed..5bba9de0b 100644
--- a/pkg/sentry/inet/BUILD
+++ b/pkg/sentry/inet/BUILD
@@ -15,6 +15,7 @@ go_library(
],
deps = [
"//pkg/context",
+ "//pkg/tcpip",
"//pkg/tcpip/stack",
],
)
diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go
index c0b4831d1..fbe6d6aa6 100644
--- a/pkg/sentry/inet/inet.go
+++ b/pkg/sentry/inet/inet.go
@@ -15,7 +15,10 @@
// Package inet defines semantics for IP stacks.
package inet
-import "gvisor.dev/gvisor/pkg/tcpip/stack"
+import (
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
// Stack represents a TCP/IP stack.
type Stack interface {
@@ -80,6 +83,12 @@ type Stack interface {
// RestoreCleanupEndpoints adds endpoints to cleanup tracking. This is useful
// for restoring a stack after a save.
RestoreCleanupEndpoints([]stack.TransportEndpoint)
+
+ // Forwarding returns if packet forwarding between NICs is enabled.
+ Forwarding(protocol tcpip.NetworkProtocolNumber) bool
+
+ // SetForwarding enables or disables packet forwarding between NICs.
+ SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error
}
// Interface contains information about a network interface.
diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go
index 9771f01fc..1779cc6f3 100644
--- a/pkg/sentry/inet/test_stack.go
+++ b/pkg/sentry/inet/test_stack.go
@@ -14,7 +14,10 @@
package inet
-import "gvisor.dev/gvisor/pkg/tcpip/stack"
+import (
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
// TestStack is a dummy implementation of Stack for tests.
type TestStack struct {
@@ -26,6 +29,7 @@ type TestStack struct {
TCPSendBufSize TCPBufferSize
TCPSACKFlag bool
Recovery TCPLossRecovery
+ IPForwarding bool
}
// NewTestStack returns a TestStack with no network interfaces. The value of
@@ -128,3 +132,14 @@ func (s *TestStack) CleanupEndpoints() []stack.TransportEndpoint {
// RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints.
func (s *TestStack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {}
+
+// Forwarding implements inet.Stack.Forwarding.
+func (s *TestStack) Forwarding(protocol tcpip.NetworkProtocolNumber) bool {
+ return s.IPForwarding
+}
+
+// SetForwarding implements inet.Stack.SetForwarding.
+func (s *TestStack) SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error {
+ s.IPForwarding = enable
+ return nil
+}
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index d436daab4..083071b5e 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -69,8 +69,8 @@ go_template_instance(
prefix = "socket",
template = "//pkg/ilist:generic_list",
types = {
- "Element": "*SocketEntry",
- "Linker": "*SocketEntry",
+ "Element": "*SocketRecordVFS1",
+ "Linker": "*SocketRecordVFS1",
},
)
@@ -197,6 +197,7 @@ go_library(
"gvisor.dev/gvisor/pkg/sentry/device",
"gvisor.dev/gvisor/pkg/tcpip",
],
+ marshal = True,
visibility = ["//:sandbox"],
deps = [
":uncaught_signal_go_proto",
@@ -212,6 +213,8 @@ go_library(
"//pkg/eventchannel",
"//pkg/fspath",
"//pkg/log",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
"//pkg/metric",
"//pkg/refs",
"//pkg/refs_vfs2",
@@ -261,7 +264,6 @@ go_library(
"//pkg/tcpip/stack",
"//pkg/usermem",
"//pkg/waiter",
- "//tools/go_marshal/marshal",
],
)
diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD
index 2bc49483a..869e49ebc 100644
--- a/pkg/sentry/kernel/auth/BUILD
+++ b/pkg/sentry/kernel/auth/BUILD
@@ -57,6 +57,7 @@ go_library(
"id_map_set.go",
"user_namespace.go",
],
+ marshal = True,
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
diff --git a/pkg/sentry/kernel/auth/id.go b/pkg/sentry/kernel/auth/id.go
index 0a58ba17c..4c32ee703 100644
--- a/pkg/sentry/kernel/auth/id.go
+++ b/pkg/sentry/kernel/auth/id.go
@@ -19,9 +19,13 @@ import (
)
// UID is a user ID in an unspecified user namespace.
+//
+// +marshal
type UID uint32
// GID is a group ID in an unspecified user namespace.
+//
+// +marshal slice:GIDSlice
type GID uint32
// In the root user namespace, user/group IDs have a 1-to-1 relationship with
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index 5773244ac..0ec7344cd 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -111,8 +111,11 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
ctx := context.Background()
f.init() // Initialize table.
+ f.used = 0
for fd, d := range m {
- f.setAll(fd, d.file, d.fileVFS2, d.flags)
+ if file, fileVFS2 := f.setAll(ctx, fd, d.file, d.fileVFS2, d.flags); file != nil || fileVFS2 != nil {
+ panic("VFS1 or VFS2 files set")
+ }
// Note that we do _not_ need to acquire a extra table reference here. The
// table reference will already be accounted for in the file, so we drop the
@@ -127,7 +130,7 @@ func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
}
// drop drops the table reference.
-func (f *FDTable) drop(file *fs.File) {
+func (f *FDTable) drop(ctx context.Context, file *fs.File) {
// Release locks.
file.Dirent.Inode.LockCtx.Posix.UnlockRegion(f, lock.LockRange{0, lock.LockEOF})
@@ -145,14 +148,13 @@ func (f *FDTable) drop(file *fs.File) {
d.InotifyEvent(ev, 0)
// Drop the table reference.
- file.DecRef(context.Background())
+ file.DecRef(ctx)
}
// dropVFS2 drops the table reference.
-func (f *FDTable) dropVFS2(file *vfs.FileDescription) {
+func (f *FDTable) dropVFS2(ctx context.Context, file *vfs.FileDescription) {
// Release any POSIX lock possibly held by the FDTable. Range {0, 0} means the
// entire file.
- ctx := context.Background()
err := file.UnlockPOSIX(ctx, f, 0, 0, linux.SEEK_SET)
if err != nil && err != syserror.ENOLCK {
panic(fmt.Sprintf("UnlockPOSIX failed: %v", err))
@@ -187,12 +189,6 @@ func (f *FDTable) DecRef(ctx context.Context) {
})
}
-// Size returns the number of file descriptor slots currently allocated.
-func (f *FDTable) Size() int {
- size := atomic.LoadInt32(&f.used)
- return int(size)
-}
-
// forEach iterates over all non-nil files in sorted order.
//
// It is the caller's responsibility to acquire an appropriate lock.
@@ -279,7 +275,6 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
}
f.mu.Lock()
- defer f.mu.Unlock()
// From f.next to find available fd.
if fd < f.next {
@@ -289,15 +284,25 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
// Install all entries.
for i := fd; i < end && len(fds) < len(files); i++ {
if d, _, _ := f.get(i); d == nil {
- f.set(i, files[len(fds)], flags) // Set the descriptor.
- fds = append(fds, i) // Record the file descriptor.
+ // Set the descriptor.
+ f.set(ctx, i, files[len(fds)], flags)
+ fds = append(fds, i) // Record the file descriptor.
}
}
// Failure? Unwind existing FDs.
if len(fds) < len(files) {
for _, i := range fds {
- f.set(i, nil, FDFlags{}) // Zap entry.
+ f.set(ctx, i, nil, FDFlags{})
+ }
+ f.mu.Unlock()
+
+ // Drop the reference taken by the call to f.set() that
+ // originally installed the file. Don't call f.drop()
+ // (generating inotify events, etc.) since the file should
+ // appear to have never been inserted into f.
+ for _, file := range files[:len(fds)] {
+ file.DecRef(ctx)
}
return nil, syscall.EMFILE
}
@@ -307,6 +312,7 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
f.next = fds[len(fds)-1] + 1
}
+ f.mu.Unlock()
return fds, nil
}
@@ -334,7 +340,6 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes
}
f.mu.Lock()
- defer f.mu.Unlock()
// From f.next to find available fd.
if fd < f.next {
@@ -344,15 +349,25 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes
// Install all entries.
for i := fd; i < end && len(fds) < len(files); i++ {
if d, _, _ := f.getVFS2(i); d == nil {
- f.setVFS2(i, files[len(fds)], flags) // Set the descriptor.
- fds = append(fds, i) // Record the file descriptor.
+ // Set the descriptor.
+ f.setVFS2(ctx, i, files[len(fds)], flags)
+ fds = append(fds, i) // Record the file descriptor.
}
}
// Failure? Unwind existing FDs.
if len(fds) < len(files) {
for _, i := range fds {
- f.setVFS2(i, nil, FDFlags{}) // Zap entry.
+ f.setVFS2(ctx, i, nil, FDFlags{})
+ }
+ f.mu.Unlock()
+
+ // Drop the reference taken by the call to f.setVFS2() that
+ // originally installed the file. Don't call f.dropVFS2()
+ // (generating inotify events, etc.) since the file should
+ // appear to have never been inserted into f.
+ for _, file := range files[:len(fds)] {
+ file.DecRef(ctx)
}
return nil, syscall.EMFILE
}
@@ -362,6 +377,7 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes
f.next = fds[len(fds)-1] + 1
}
+ f.mu.Unlock()
return fds, nil
}
@@ -397,7 +413,7 @@ func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDesc
}
for fd < end {
if d, _, _ := f.getVFS2(fd); d == nil {
- f.setVFS2(fd, file, flags)
+ f.setVFS2(ctx, fd, file, flags)
if fd == f.next {
// Update next search start position.
f.next = fd + 1
@@ -413,40 +429,55 @@ func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDesc
// reference for that FD, the ref count for that existing reference is
// decremented.
func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error {
- return f.newFDAt(ctx, fd, file, nil, flags)
+ df, _, err := f.newFDAt(ctx, fd, file, nil, flags)
+ if err != nil {
+ return err
+ }
+ if df != nil {
+ f.drop(ctx, df)
+ }
+ return nil
}
// NewFDAtVFS2 sets the file reference for the given FD. If there is an active
// reference for that FD, the ref count for that existing reference is
// decremented.
func (f *FDTable) NewFDAtVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) error {
- return f.newFDAt(ctx, fd, nil, file, flags)
+ _, dfVFS2, err := f.newFDAt(ctx, fd, nil, file, flags)
+ if err != nil {
+ return err
+ }
+ if dfVFS2 != nil {
+ f.dropVFS2(ctx, dfVFS2)
+ }
+ return nil
}
-func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) error {
+func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) (*fs.File, *vfs.FileDescription, error) {
if fd < 0 {
// Don't accept negative FDs.
- return syscall.EBADF
+ return nil, nil, syscall.EBADF
}
// Check the limit for the provided file.
if limitSet := limits.FromContext(ctx); limitSet != nil {
if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur {
- return syscall.EMFILE
+ return nil, nil, syscall.EMFILE
}
}
// Install the entry.
f.mu.Lock()
defer f.mu.Unlock()
- f.setAll(fd, file, fileVFS2, flags)
- return nil
+
+ df, dfVFS2 := f.setAll(ctx, fd, file, fileVFS2, flags)
+ return df, dfVFS2, nil
}
// SetFlags sets the flags for the given file descriptor.
//
// True is returned iff flags were changed.
-func (f *FDTable) SetFlags(fd int32, flags FDFlags) error {
+func (f *FDTable) SetFlags(ctx context.Context, fd int32, flags FDFlags) error {
if fd < 0 {
// Don't accept negative FDs.
return syscall.EBADF
@@ -462,14 +493,14 @@ func (f *FDTable) SetFlags(fd int32, flags FDFlags) error {
}
// Update the flags.
- f.set(fd, file, flags)
+ f.set(ctx, fd, file, flags)
return nil
}
// SetFlagsVFS2 sets the flags for the given file descriptor.
//
// True is returned iff flags were changed.
-func (f *FDTable) SetFlagsVFS2(fd int32, flags FDFlags) error {
+func (f *FDTable) SetFlagsVFS2(ctx context.Context, fd int32, flags FDFlags) error {
if fd < 0 {
// Don't accept negative FDs.
return syscall.EBADF
@@ -485,7 +516,7 @@ func (f *FDTable) SetFlagsVFS2(fd int32, flags FDFlags) error {
}
// Update the flags.
- f.setVFS2(fd, file, flags)
+ f.setVFS2(ctx, fd, file, flags)
return nil
}
@@ -551,30 +582,6 @@ func (f *FDTable) GetFDs(ctx context.Context) []int32 {
return fds
}
-// GetRefs returns a stable slice of references to all files and bumps the
-// reference count on each. The caller must use DecRef on each reference when
-// they're done using the slice.
-func (f *FDTable) GetRefs(ctx context.Context) []*fs.File {
- files := make([]*fs.File, 0, f.Size())
- f.forEach(ctx, func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
- file.IncRef() // Acquire a reference for caller.
- files = append(files, file)
- })
- return files
-}
-
-// GetRefsVFS2 returns a stable slice of references to all files and bumps the
-// reference count on each. The caller must use DecRef on each reference when
-// they're done using the slice.
-func (f *FDTable) GetRefsVFS2(ctx context.Context) []*vfs.FileDescription {
- files := make([]*vfs.FileDescription, 0, f.Size())
- f.forEach(ctx, func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) {
- file.IncRef() // Acquire a reference for caller.
- files = append(files, file)
- })
- return files
-}
-
// Fork returns an independent FDTable.
func (f *FDTable) Fork(ctx context.Context) *FDTable {
clone := f.k.NewFDTable()
@@ -582,11 +589,8 @@ func (f *FDTable) Fork(ctx context.Context) *FDTable {
f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
// The set function here will acquire an appropriate table
// reference for the clone. We don't need anything else.
- switch {
- case file != nil:
- clone.set(fd, file, flags)
- case fileVFS2 != nil:
- clone.setVFS2(fd, fileVFS2, flags)
+ if df, dfVFS2 := clone.setAll(ctx, fd, file, fileVFS2, flags); df != nil || dfVFS2 != nil {
+ panic("VFS1 or VFS2 files set")
}
})
return clone
@@ -595,13 +599,12 @@ func (f *FDTable) Fork(ctx context.Context) *FDTable {
// Remove removes an FD from and returns a non-file iff successful.
//
// N.B. Callers are required to use DecRef when they are done.
-func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) {
+func (f *FDTable) Remove(ctx context.Context, fd int32) (*fs.File, *vfs.FileDescription) {
if fd < 0 {
return nil, nil
}
f.mu.Lock()
- defer f.mu.Unlock()
// Update current available position.
if fd < f.next {
@@ -617,24 +620,51 @@ func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) {
case orig2 != nil:
orig2.IncRef()
}
+
if orig != nil || orig2 != nil {
- f.setAll(fd, nil, nil, FDFlags{}) // Zap entry.
+ orig, orig2 = f.setAll(ctx, fd, nil, nil, FDFlags{}) // Zap entry.
+ }
+ f.mu.Unlock()
+
+ if orig != nil {
+ f.drop(ctx, orig)
}
+ if orig2 != nil {
+ f.dropVFS2(ctx, orig2)
+ }
+
return orig, orig2
}
// RemoveIf removes all FDs where cond is true.
func (f *FDTable) RemoveIf(ctx context.Context, cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) {
- f.mu.Lock()
- defer f.mu.Unlock()
+ // TODO(gvisor.dev/issue/1624): Remove fs.File slice.
+ var files []*fs.File
+ var filesVFS2 []*vfs.FileDescription
+ f.mu.Lock()
f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
if cond(file, fileVFS2, flags) {
- f.set(fd, nil, FDFlags{}) // Clear from table.
+ df, dfVFS2 := f.setAll(ctx, fd, nil, nil, FDFlags{}) // Clear from table.
+ if df != nil {
+ files = append(files, df)
+ }
+ if dfVFS2 != nil {
+ filesVFS2 = append(filesVFS2, dfVFS2)
+ }
// Update current available position.
if fd < f.next {
f.next = fd
}
}
})
+ f.mu.Unlock()
+
+ for _, file := range files {
+ f.drop(ctx, file)
+ }
+
+ for _, file := range filesVFS2 {
+ f.dropVFS2(ctx, file)
+ }
}
diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go
index e3f30ba2a..bf5460083 100644
--- a/pkg/sentry/kernel/fd_table_test.go
+++ b/pkg/sentry/kernel/fd_table_test.go
@@ -72,7 +72,7 @@ func TestFDTableMany(t *testing.T) {
}
i := int32(2)
- fdTable.Remove(i)
+ fdTable.Remove(ctx, i)
if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil || fds[0] != i {
t.Fatalf("Allocated %v FDs but wanted to allocate %v: %v", i, maxFD, err)
}
@@ -93,7 +93,7 @@ func TestFDTableOverLimit(t *testing.T) {
t.Fatalf("fdTable.NewFDs(maxFD-3, {f,f,f}): got %v, wanted nil", err)
} else {
for _, fd := range fds {
- fdTable.Remove(fd)
+ fdTable.Remove(ctx, fd)
}
}
@@ -150,13 +150,13 @@ func TestFDTable(t *testing.T) {
t.Fatalf("fdTable.Get(2): got a %v, wanted nil", ref)
}
- ref, _ := fdTable.Remove(1)
+ ref, _ := fdTable.Remove(ctx, 1)
if ref == nil {
t.Fatalf("fdTable.Remove(1) for an existing FD: failed, want success")
}
ref.DecRef(ctx)
- if ref, _ := fdTable.Remove(1); ref != nil {
+ if ref, _ := fdTable.Remove(ctx, 1); ref != nil {
t.Fatalf("r.Remove(1) for a removed FD: got success, want failure")
}
})
diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go
index 6b8feb107..da79e6627 100644
--- a/pkg/sentry/kernel/fd_table_unsafe.go
+++ b/pkg/sentry/kernel/fd_table_unsafe.go
@@ -18,6 +18,7 @@ import (
"sync/atomic"
"unsafe"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/vfs"
)
@@ -78,33 +79,37 @@ func (f *FDTable) getAll(fd int32) (*fs.File, *vfs.FileDescription, FDFlags, boo
return d.file, d.fileVFS2, d.flags, true
}
-// set sets an entry.
-//
-// This handles accounting changes, as well as acquiring and releasing the
-// reference needed by the table iff the file is different.
+// CurrentMaxFDs returns the number of file descriptors that may be stored in f
+// without reallocation.
+func (f *FDTable) CurrentMaxFDs() int {
+ slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice))
+ return len(slice)
+}
+
+// set sets an entry for VFS1, refer to setAll().
//
// Precondition: mu must be held.
-func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) {
- f.setAll(fd, file, nil, flags)
+func (f *FDTable) set(ctx context.Context, fd int32, file *fs.File, flags FDFlags) *fs.File {
+ dropFile, _ := f.setAll(ctx, fd, file, nil, flags)
+ return dropFile
}
-// setVFS2 sets an entry.
-//
-// This handles accounting changes, as well as acquiring and releasing the
-// reference needed by the table iff the file is different.
+// setVFS2 sets an entry for VFS2, refer to setAll().
//
// Precondition: mu must be held.
-func (f *FDTable) setVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) {
- f.setAll(fd, nil, file, flags)
+func (f *FDTable) setVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) *vfs.FileDescription {
+ _, dropFile := f.setAll(ctx, fd, nil, file, flags)
+ return dropFile
}
-// setAll sets an entry.
-//
-// This handles accounting changes, as well as acquiring and releasing the
-// reference needed by the table iff the file is different.
+// setAll sets the file description referred to by fd to file/fileVFS2. If
+// file/fileVFS2 are non-nil, it takes a reference on them. If setAll replaces
+// an existing file description, it returns it with the FDTable's reference
+// transferred to the caller, which must call f.drop/dropVFS2() on the returned
+// file after unlocking f.mu.
//
// Precondition: mu must be held.
-func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+func (f *FDTable) setAll(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) (*fs.File, *vfs.FileDescription) {
if file != nil && fileVFS2 != nil {
panic("VFS1 and VFS2 files set")
}
@@ -147,25 +152,25 @@ func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription,
}
}
- // Drop the table reference.
+ // Adjust used.
+ switch {
+ case orig == nil && desc != nil:
+ atomic.AddInt32(&f.used, 1)
+ case orig != nil && desc == nil:
+ atomic.AddInt32(&f.used, -1)
+ }
+
if orig != nil {
switch {
case orig.file != nil:
if desc == nil || desc.file != orig.file {
- f.drop(orig.file)
+ return orig.file, nil
}
case orig.fileVFS2 != nil:
if desc == nil || desc.fileVFS2 != orig.fileVFS2 {
- f.dropVFS2(orig.fileVFS2)
+ return nil, orig.fileVFS2
}
}
}
-
- // Adjust used.
- switch {
- case orig == nil && desc != nil:
- atomic.AddInt32(&f.used, 1)
- case orig != nil && desc == nil:
- atomic.AddInt32(&f.used, -1)
- }
+ return nil, nil
}
diff --git a/pkg/sentry/kernel/kcov.go b/pkg/sentry/kernel/kcov.go
index aad63aa99..d3e76ca7b 100644
--- a/pkg/sentry/kernel/kcov.go
+++ b/pkg/sentry/kernel/kcov.go
@@ -89,6 +89,10 @@ func (kcov *Kcov) TaskWork(t *Task) {
kcov.mu.Lock()
defer kcov.mu.Unlock()
+ if kcov.mode != linux.KCOV_TRACE_PC {
+ return
+ }
+
rw := &kcovReadWriter{
mf: kcov.mfp.MemoryFile(),
fr: kcov.mappable.FileRange(),
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 402aa1718..d6c21adb7 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -220,13 +220,18 @@ type Kernel struct {
// danglingEndpoints is used to save / restore tcpip.DanglingEndpoints.
danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"`
- // sockets is the list of all network sockets the system. Protected by
- // extMu.
+ // sockets is the list of all network sockets in the system.
+ // Protected by extMu.
+ // TODO(gvisor.dev/issue/1624): Only used by VFS1.
sockets socketList
- // nextSocketEntry is the next entry number to use in sockets. Protected
+ // socketsVFS2 records all network sockets in the system. Protected by
+ // extMu.
+ socketsVFS2 map[*vfs.FileDescription]*SocketRecord
+
+ // nextSocketRecord is the next entry number to use in sockets. Protected
// by extMu.
- nextSocketEntry uint64
+ nextSocketRecord uint64
// deviceRegistry is used to save/restore device.SimpleDevices.
deviceRegistry struct{} `state:".(*device.Registry)"`
@@ -414,6 +419,8 @@ func (k *Kernel) Init(args InitKernelArgs) error {
return fmt.Errorf("failed to create sockfs mount: %v", err)
}
k.socketMount = socketMount
+
+ k.socketsVFS2 = make(map[*vfs.FileDescription]*SocketRecord)
}
return nil
@@ -507,6 +514,10 @@ func (k *Kernel) SaveTo(w wire.Writer) error {
// flushMountSourceRefs flushes the MountSources for all mounted filesystems
// and open FDs.
func (k *Kernel) flushMountSourceRefs(ctx context.Context) error {
+ if VFS2Enabled {
+ return nil // Not relevant.
+ }
+
// Flush all mount sources for currently mounted filesystems in each task.
flushed := make(map[*fs.MountNamespace]struct{})
k.tasks.mu.RLock()
@@ -533,11 +544,6 @@ func (k *Kernel) flushMountSourceRefs(ctx context.Context) error {
//
// Precondition: Must be called with the kernel paused.
func (ts *TaskSet) forEachFDPaused(ctx context.Context, f func(*fs.File, *vfs.FileDescription) error) (err error) {
- // TODO(gvisor.dev/issue/1663): Add save support for VFS2.
- if VFS2Enabled {
- return nil
- }
-
ts.mu.RLock()
defer ts.mu.RUnlock()
for t := range ts.Root.tids {
@@ -556,6 +562,10 @@ func (ts *TaskSet) forEachFDPaused(ctx context.Context, f func(*fs.File, *vfs.Fi
func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
+ if VFS2Enabled {
+ return nil
+ }
+
return ts.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error {
if flags := file.Flags(); !flags.Write {
return nil
@@ -888,17 +898,18 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
opener fsbridge.Lookup
fsContext *FSContext
mntns *fs.MountNamespace
+ mntnsVFS2 *vfs.MountNamespace
)
if VFS2Enabled {
- mntnsVFS2 := args.MountNamespaceVFS2
+ mntnsVFS2 = args.MountNamespaceVFS2
if mntnsVFS2 == nil {
// MountNamespaceVFS2 adds a reference to the namespace, which is
// transferred to the new process.
mntnsVFS2 = k.globalInit.Leader().MountNamespaceVFS2()
}
// Get the root directory from the MountNamespace.
- root := args.MountNamespaceVFS2.Root()
+ root := mntnsVFS2.Root()
// The call to newFSContext below will take a reference on root, so we
// don't need to hold this one.
defer root.DecRef(ctx)
@@ -1008,7 +1019,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
UTSNamespace: args.UTSNamespace,
IPCNamespace: args.IPCNamespace,
AbstractSocketNamespace: args.AbstractSocketNamespace,
- MountNamespaceVFS2: args.MountNamespaceVFS2,
+ MountNamespaceVFS2: mntnsVFS2,
ContainerID: args.ContainerID,
}
t, err := k.tasks.NewTask(config)
@@ -1508,20 +1519,27 @@ func (k *Kernel) SupervisorContext() context.Context {
}
}
-// SocketEntry represents a socket recorded in Kernel.sockets. It implements
+// SocketRecord represents a socket recorded in Kernel.socketsVFS2.
+//
+// +stateify savable
+type SocketRecord struct {
+ k *Kernel
+ Sock *refs.WeakRef // TODO(gvisor.dev/issue/1624): Only used by VFS1.
+ SockVFS2 *vfs.FileDescription // Only used by VFS2.
+ ID uint64 // Socket table entry number.
+}
+
+// SocketRecordVFS1 represents a socket recorded in Kernel.sockets. It implements
// refs.WeakRefUser for sockets stored in the socket table.
//
// +stateify savable
-type SocketEntry struct {
+type SocketRecordVFS1 struct {
socketEntry
- k *Kernel
- Sock *refs.WeakRef
- SockVFS2 *vfs.FileDescription
- ID uint64 // Socket table entry number.
+ SocketRecord
}
// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
-func (s *SocketEntry) WeakRefGone(context.Context) {
+func (s *SocketRecordVFS1) WeakRefGone(context.Context) {
s.k.extMu.Lock()
s.k.sockets.Remove(s)
s.k.extMu.Unlock()
@@ -1532,9 +1550,14 @@ func (s *SocketEntry) WeakRefGone(context.Context) {
// Precondition: Caller must hold a reference to sock.
func (k *Kernel) RecordSocket(sock *fs.File) {
k.extMu.Lock()
- id := k.nextSocketEntry
- k.nextSocketEntry++
- s := &SocketEntry{k: k, ID: id}
+ id := k.nextSocketRecord
+ k.nextSocketRecord++
+ s := &SocketRecordVFS1{
+ SocketRecord: SocketRecord{
+ k: k,
+ ID: id,
+ },
+ }
s.Sock = refs.NewWeakRef(sock, s)
k.sockets.PushBack(s)
k.extMu.Unlock()
@@ -1546,29 +1569,45 @@ func (k *Kernel) RecordSocket(sock *fs.File) {
// Precondition: Caller must hold a reference to sock.
//
// Note that the socket table will not hold a reference on the
-// vfs.FileDescription, because we do not support weak refs on VFS2 files.
+// vfs.FileDescription.
func (k *Kernel) RecordSocketVFS2(sock *vfs.FileDescription) {
k.extMu.Lock()
- id := k.nextSocketEntry
- k.nextSocketEntry++
- s := &SocketEntry{
+ if _, ok := k.socketsVFS2[sock]; ok {
+ panic(fmt.Sprintf("Socket %p added twice", sock))
+ }
+ id := k.nextSocketRecord
+ k.nextSocketRecord++
+ s := &SocketRecord{
k: k,
ID: id,
SockVFS2: sock,
}
- k.sockets.PushBack(s)
+ k.socketsVFS2[sock] = s
+ k.extMu.Unlock()
+}
+
+// DeleteSocketVFS2 removes a VFS2 socket from the system-wide socket table.
+func (k *Kernel) DeleteSocketVFS2(sock *vfs.FileDescription) {
+ k.extMu.Lock()
+ delete(k.socketsVFS2, sock)
k.extMu.Unlock()
}
// ListSockets returns a snapshot of all sockets.
//
-// Callers of ListSockets() in VFS2 should use SocketEntry.SockVFS2.TryIncRef()
+// Callers of ListSockets() in VFS2 should use SocketRecord.SockVFS2.TryIncRef()
// to get a reference on a socket in the table.
-func (k *Kernel) ListSockets() []*SocketEntry {
+func (k *Kernel) ListSockets() []*SocketRecord {
k.extMu.Lock()
- var socks []*SocketEntry
- for s := k.sockets.Front(); s != nil; s = s.Next() {
- socks = append(socks, s)
+ var socks []*SocketRecord
+ if VFS2Enabled {
+ for _, s := range k.socketsVFS2 {
+ socks = append(socks, s)
+ }
+ } else {
+ for s := k.sockets.Front(); s != nil; s = s.Next() {
+ socks = append(socks, &s.SocketRecord)
+ }
}
k.extMu.Unlock()
return socks
diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD
index 449643118..99134e634 100644
--- a/pkg/sentry/kernel/pipe/BUILD
+++ b/pkg/sentry/kernel/pipe/BUILD
@@ -21,6 +21,7 @@ go_library(
"//pkg/amutex",
"//pkg/buffer",
"//pkg/context",
+ "//pkg/marshal/primitive",
"//pkg/safemem",
"//pkg/sentry/arch",
"//pkg/sentry/device",
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index c410c96aa..67beb0ad6 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -17,6 +17,7 @@ package pipe
import (
"fmt"
+ "io"
"sync/atomic"
"syscall"
@@ -215,7 +216,7 @@ func (p *Pipe) readLocked(ctx context.Context, ops readOps) (int64, error) {
if p.view.Size() == 0 {
if !p.HasWriters() {
// There are no writers, return EOF.
- return 0, nil
+ return 0, io.EOF
}
return 0, syserror.ErrWouldBlock
}
diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go
index 6d58b682f..f665920cb 100644
--- a/pkg/sentry/kernel/pipe/pipe_util.go
+++ b/pkg/sentry/kernel/pipe/pipe_util.go
@@ -23,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/amutex"
"gvisor.dev/gvisor/pkg/buffer"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/usermem"
@@ -145,9 +146,14 @@ func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgume
v = math.MaxInt32 // Silently truncate.
}
// Copy result to userspace.
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ iocc := primitive.IOCopyContext{
+ IO: io,
+ Ctx: ctx,
+ Opts: usermem.IOOpts{
+ AddressSpaceActive: true,
+ },
+ }
+ _, err := primitive.CopyInt32Out(&iocc, args[2].Pointer(), int32(v))
return 0, err
default:
return 0, syscall.ENOTTY
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index f223d59e1..f61039f5b 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -67,6 +67,11 @@ func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlag
return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags, locks), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags, locks)
}
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (*VFSPipe) Allocate(context.Context, uint64, uint64, uint64) error {
+ return syserror.ESPIPE
+}
+
// Open opens the pipe represented by vp.
func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) (*vfs.FileDescription, error) {
vp.mu.Lock()
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go
index 50df179c3..1145faf13 100644
--- a/pkg/sentry/kernel/ptrace.go
+++ b/pkg/sentry/kernel/ptrace.go
@@ -18,6 +18,7 @@ import (
"fmt"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/mm"
"gvisor.dev/gvisor/pkg/syserror"
@@ -999,18 +1000,15 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
// at the address specified by the data parameter, and the return value
// is the error flag." - ptrace(2)
word := t.Arch().Native(0)
- if _, err := usermem.CopyObjectIn(t, target.MemoryManager(), addr, word, usermem.IOOpts{
- IgnorePermissions: true,
- }); err != nil {
+ if _, err := word.CopyIn(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr); err != nil {
return err
}
- _, err := t.CopyOut(data, word)
+ _, err := word.CopyOut(t, data)
return err
case linux.PTRACE_POKETEXT, linux.PTRACE_POKEDATA:
- _, err := usermem.CopyObjectOut(t, target.MemoryManager(), addr, t.Arch().Native(uintptr(data)), usermem.IOOpts{
- IgnorePermissions: true,
- })
+ word := t.Arch().Native(uintptr(data))
+ _, err := word.CopyOut(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr)
return err
case linux.PTRACE_GETREGSET:
@@ -1078,12 +1076,12 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
if target.ptraceSiginfo == nil {
return syserror.EINVAL
}
- _, err := t.CopyOut(data, target.ptraceSiginfo)
+ _, err := target.ptraceSiginfo.CopyOut(t, data)
return err
case linux.PTRACE_SETSIGINFO:
var info arch.SignalInfo
- if _, err := t.CopyIn(data, &info); err != nil {
+ if _, err := info.CopyIn(t, data); err != nil {
return err
}
t.tg.pidns.owner.mu.RLock()
@@ -1098,7 +1096,8 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
if addr != linux.SignalSetSize {
return syserror.EINVAL
}
- _, err := t.CopyOut(data, target.SignalMask())
+ mask := target.SignalMask()
+ _, err := mask.CopyOut(t, data)
return err
case linux.PTRACE_SETSIGMASK:
@@ -1106,7 +1105,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
return syserror.EINVAL
}
var mask linux.SignalSet
- if _, err := t.CopyIn(data, &mask); err != nil {
+ if _, err := mask.CopyIn(t, data); err != nil {
return err
}
// The target's task goroutine is stopped, so this is safe:
@@ -1121,7 +1120,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
case linux.PTRACE_GETEVENTMSG:
t.tg.pidns.owner.mu.RLock()
defer t.tg.pidns.owner.mu.RUnlock()
- _, err := t.CopyOut(usermem.Addr(data), target.ptraceEventMsg)
+ _, err := primitive.CopyUint64Out(t, usermem.Addr(data), target.ptraceEventMsg)
return err
// PEEKSIGINFO is unimplemented but seems to have no users anywhere.
diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go
index cef1276ec..609ad3941 100644
--- a/pkg/sentry/kernel/ptrace_amd64.go
+++ b/pkg/sentry/kernel/ptrace_amd64.go
@@ -30,7 +30,7 @@ func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) erro
if err != nil {
return err
}
- _, err = t.CopyOut(data, n)
+ _, err = n.CopyOut(t, data)
return err
case linux.PTRACE_POKEUSR: // aka PTRACE_POKEUSER
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
index 413111faf..332bdb8e8 100644
--- a/pkg/sentry/kernel/syscalls.go
+++ b/pkg/sentry/kernel/syscalls.go
@@ -348,6 +348,16 @@ func (s *SyscallTable) LookupName(sysno uintptr) string {
return fmt.Sprintf("sys_%d", sysno) // Unlikely.
}
+// LookupNo looks up a syscall number by name.
+func (s *SyscallTable) LookupNo(name string) (uintptr, error) {
+ for i, syscall := range s.Table {
+ if syscall.Name == name {
+ return uintptr(i), nil
+ }
+ }
+ return 0, fmt.Errorf("syscall %q not found", name)
+}
+
// LookupEmulate looks up an emulation syscall number.
func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) {
sysno, ok := s.Emulate[addr]
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 9d7a9128f..fce1064a7 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -341,12 +341,12 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
nt.SetClearTID(opts.ChildTID)
}
if opts.ChildSetTID {
- // Can't use Task.CopyOut, which assumes AddressSpaceActive.
- usermem.CopyObjectOut(t, nt.MemoryManager(), opts.ChildTID, nt.ThreadID(), usermem.IOOpts{})
+ ctid := nt.ThreadID()
+ ctid.CopyOut(nt.AsCopyContext(usermem.IOOpts{AddressSpaceActive: false}), opts.ChildTID)
}
ntid := t.tg.pidns.IDOfTask(nt)
if opts.ParentSetTID {
- t.CopyOut(opts.ParentTID, ntid)
+ ntid.CopyOut(t, opts.ParentTID)
}
kind := ptraceCloneKindClone
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index 9fa528384..d1136461a 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -126,7 +126,11 @@ func (t *Task) SyscallTable() *SyscallTable {
// Preconditions: The caller must be running on the task goroutine, or t.mu
// must be locked.
func (t *Task) Stack() *arch.Stack {
- return &arch.Stack{t.Arch(), t.MemoryManager(), usermem.Addr(t.Arch().Stack())}
+ return &arch.Stack{
+ Arch: t.Arch(),
+ IO: t.MemoryManager(),
+ Bottom: usermem.Addr(t.Arch().Stack()),
+ }
}
// LoadTaskImage loads a specified file into a new TaskContext.
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index b76f7f503..b400a8b41 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -248,7 +248,8 @@ func (*runExitMain) execute(t *Task) taskRunState {
signaled := t.tg.exiting && t.tg.exitStatus.Signaled()
t.tg.signalHandlers.mu.Unlock()
if !signaled {
- if _, err := t.CopyOut(t.cleartid, ThreadID(0)); err == nil {
+ zero := ThreadID(0)
+ if _, err := zero.CopyOut(t, t.cleartid); err == nil {
t.Futex().Wake(t, t.cleartid, false, ^uint32(0), 1)
}
// If the CopyOut fails, there's nothing we can do.
diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go
index 4b535c949..c80391475 100644
--- a/pkg/sentry/kernel/task_futex.go
+++ b/pkg/sentry/kernel/task_futex.go
@@ -16,6 +16,7 @@ package kernel
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -87,7 +88,7 @@ func (t *Task) exitRobustList() {
return
}
- next := rl.List
+ next := primitive.Uint64(rl.List)
done := 0
var pendingLockAddr usermem.Addr
if rl.ListOpPending != 0 {
@@ -99,12 +100,12 @@ func (t *Task) exitRobustList() {
// We traverse to the next element of the list before we
// actually wake anything. This prevents the race where waking
// this futex causes a modification of the list.
- thisLockAddr := usermem.Addr(next + rl.FutexOffset)
+ thisLockAddr := usermem.Addr(uint64(next) + rl.FutexOffset)
// Try to decode the next element in the list before waking the
// current futex. But don't check the error until after we've
// woken the current futex. Linux does it in this order too
- _, nextErr := t.CopyIn(usermem.Addr(next), &next)
+ _, nextErr := next.CopyIn(t, usermem.Addr(next))
// Wakeup the current futex if it's not pending.
if thisLockAddr != pendingLockAddr {
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index aa3a573c0..8dc3fec90 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -141,7 +141,7 @@ func (*runApp) handleCPUIDInstruction(t *Task) error {
region := trace.StartRegion(t.traceContext, cpuidRegion)
expected := arch.CPUIDInstruction[:]
found := make([]byte, len(expected))
- _, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found)
+ _, err := t.CopyInBytes(usermem.Addr(t.Arch().IP()), found)
if err == nil && bytes.Equal(expected, found) {
// Skip the cpuid instruction.
t.Arch().CPUIDEmulate(t)
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index feaa38596..ebdb83061 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -259,7 +259,11 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct)
// Set up the signal handler. If we have a saved signal mask, the signal
// handler should run with the current mask, but sigreturn should restore
// the saved one.
- st := &arch.Stack{t.Arch(), mm, sp}
+ st := &arch.Stack{
+ Arch: t.Arch(),
+ IO: mm,
+ Bottom: sp,
+ }
mask := t.signalMask
if t.haveSavedSignalMask {
mask = t.savedSignalMask
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
index 2dbf86547..0141459e7 100644
--- a/pkg/sentry/kernel/task_syscall.go
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -22,6 +22,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/bits"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/metric"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/memmap"
@@ -287,7 +288,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState {
// Grab the caller up front, to make sure there's a sensible stack.
caller := t.Arch().Native(uintptr(0))
- if _, err := t.CopyIn(usermem.Addr(t.Arch().Stack()), caller); err != nil {
+ if _, err := caller.CopyIn(t, usermem.Addr(t.Arch().Stack())); err != nil {
t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err)
t.forceSignal(linux.SIGSEGV, false /* unconditional */)
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
@@ -323,7 +324,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState {
type runVsyscallAfterPtraceEventSeccomp struct {
addr usermem.Addr
sysno uintptr
- caller interface{}
+ caller marshal.Marshallable
}
func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
@@ -346,7 +347,7 @@ func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller)
}
-func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller interface{}) taskRunState {
+func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller marshal.Marshallable) taskRunState {
rval, ctrl, err := t.executeSyscall(sysno, args)
if ctrl != nil {
t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl)
diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go
index 0cb86e390..ce134bf54 100644
--- a/pkg/sentry/kernel/task_usermem.go
+++ b/pkg/sentry/kernel/task_usermem.go
@@ -18,6 +18,7 @@ import (
"math"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -43,17 +44,6 @@ func (t *Task) Deactivate() {
}
}
-// CopyIn copies a fixed-size value or slice of fixed-size values in from the
-// task's memory. The copy will fail with syscall.EFAULT if it traverses user
-// memory that is unmapped or not readable by the user.
-//
-// This Task's AddressSpace must be active.
-func (t *Task) CopyIn(addr usermem.Addr, dst interface{}) (int, error) {
- return usermem.CopyObjectIn(t, t.MemoryManager(), addr, dst, usermem.IOOpts{
- AddressSpaceActive: true,
- })
-}
-
// CopyInBytes is a fast version of CopyIn if the caller can serialize the
// data without reflection and pass in a byte slice.
//
@@ -64,17 +54,6 @@ func (t *Task) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
})
}
-// CopyOut copies a fixed-size value or slice of fixed-size values out to the
-// task's memory. The copy will fail with syscall.EFAULT if it traverses user
-// memory that is unmapped or not writeable by the user.
-//
-// This Task's AddressSpace must be active.
-func (t *Task) CopyOut(addr usermem.Addr, src interface{}) (int, error) {
- return usermem.CopyObjectOut(t, t.MemoryManager(), addr, src, usermem.IOOpts{
- AddressSpaceActive: true,
- })
-}
-
// CopyOutBytes is a fast version of CopyOut if the caller can serialize the
// data without reflection and pass in a byte slice.
//
@@ -114,7 +93,7 @@ func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([
var v []string
for {
argAddr := t.Arch().Native(0)
- if _, err := t.CopyIn(addr, argAddr); err != nil {
+ if _, err := argAddr.CopyIn(t, addr); err != nil {
return v, err
}
if t.Arch().Value(argAddr) == 0 {
@@ -302,29 +281,29 @@ func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOp
}, nil
}
-// CopyContextWithOpts wraps a task to allow copying memory to and from the
-// task memory with user specified usermem.IOOpts.
-type CopyContextWithOpts struct {
+// copyContext implements marshal.CopyContext. It wraps a task to allow copying
+// memory to and from the task memory with custom usermem.IOOpts.
+type copyContext struct {
*Task
opts usermem.IOOpts
}
-// AsCopyContextWithOpts wraps the task and returns it as CopyContextWithOpts.
-func (t *Task) AsCopyContextWithOpts(opts usermem.IOOpts) *CopyContextWithOpts {
- return &CopyContextWithOpts{t, opts}
+// AsCopyContext wraps the task and returns it as CopyContext.
+func (t *Task) AsCopyContext(opts usermem.IOOpts) marshal.CopyContext {
+ return &copyContext{t, opts}
}
// CopyInString copies a string in from the task's memory.
-func (t *CopyContextWithOpts) CopyInString(addr usermem.Addr, maxLen int) (string, error) {
+func (t *copyContext) CopyInString(addr usermem.Addr, maxLen int) (string, error) {
return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxLen, t.opts)
}
// CopyInBytes copies task memory into dst from an IO context.
-func (t *CopyContextWithOpts) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
+func (t *copyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
return t.MemoryManager().CopyIn(t, addr, dst, t.opts)
}
// CopyOutBytes copies src into task memoryfrom an IO context.
-func (t *CopyContextWithOpts) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) {
+func (t *copyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) {
return t.MemoryManager().CopyOut(t, addr, src, t.opts)
}
diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go
index 872e1a82d..5ae5906e8 100644
--- a/pkg/sentry/kernel/threads.go
+++ b/pkg/sentry/kernel/threads.go
@@ -36,6 +36,8 @@ import (
const TasksLimit = (1 << 16)
// ThreadID is a generic thread identifier.
+//
+// +marshal
type ThreadID int32
// String returns a decimal representation of the ThreadID.
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index 15c88aa7c..c69b62db9 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -122,7 +122,7 @@ func allocStack(ctx context.Context, m *mm.MemoryManager, a arch.Context) (*arch
if err != nil {
return nil, err
}
- return &arch.Stack{a, m, ar.End}, nil
+ return &arch.Stack{Arch: a, IO: m, Bottom: ar.End}, nil
}
const (
@@ -247,20 +247,20 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V
}
// Push the original filename to the stack, for AT_EXECFN.
- execfn, err := stack.Push(args.Filename)
- if err != nil {
+ if _, err := stack.PushNullTerminatedByteSlice([]byte(args.Filename)); err != nil {
return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to push exec filename: %v", err), syserr.FromError(err).ToLinux())
}
+ execfn := stack.Bottom
// Push 16 random bytes on the stack which AT_RANDOM will point to.
var b [16]byte
if _, err := rand.Read(b[:]); err != nil {
return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to read random bytes: %v", err), syserr.FromError(err).ToLinux())
}
- random, err := stack.Push(b)
- if err != nil {
+ if _, err = stack.PushNullTerminatedByteSlice(b[:]); err != nil {
return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to push random bytes: %v", err), syserr.FromError(err).ToLinux())
}
+ random := stack.Bottom
c := auth.CredentialsFromContext(ctx)
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index 3e85964e4..8c9f11cce 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -242,7 +242,7 @@ type MemoryManager struct {
// +stateify savable
type vma struct {
// mappable is the virtual memory object mapped by this vma. If mappable is
- // nil, the vma represents a private anonymous mapping.
+ // nil, the vma represents an anonymous mapping.
mappable memmap.Mappable
// off is the offset into mappable at which this vma begins. If mappable is
diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go
index fdc308542..acac3d357 100644
--- a/pkg/sentry/mm/mm_test.go
+++ b/pkg/sentry/mm/mm_test.go
@@ -51,7 +51,8 @@ func TestUsageASUpdates(t *testing.T) {
defer mm.DecUsers(ctx)
addr, err := mm.MMap(ctx, memmap.MMapOpts{
- Length: 2 * usermem.PageSize,
+ Length: 2 * usermem.PageSize,
+ Private: true,
})
if err != nil {
t.Fatalf("MMap got err %v want nil", err)
diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go
index f4c93baeb..2dbe5b751 100644
--- a/pkg/sentry/mm/special_mappable.go
+++ b/pkg/sentry/mm/special_mappable.go
@@ -136,9 +136,12 @@ func (m *SpecialMappable) Length() uint64 {
// NewSharedAnonMappable returns a SpecialMappable that implements the
// semantics of mmap(MAP_SHARED|MAP_ANONYMOUS) and mappings of /dev/zero.
//
-// TODO(jamieliu): The use of SpecialMappable is a lazy code reuse hack. Linux
-// uses an ephemeral file created by mm/shmem.c:shmem_zero_setup(); we should
-// do the same to get non-zero device and inode IDs.
+// TODO(gvisor.dev/issue/1624): Linux uses an ephemeral file created by
+// mm/shmem.c:shmem_zero_setup(), and VFS2 does something analogous. VFS1 uses
+// a SpecialMappable instead, incorrectly getting device and inode IDs of zero
+// and causing memory for shared anonymous mappings to be allocated up-front
+// instead of on first touch; this is to avoid exacerbating the fs.MountSource
+// leak (b/143656263). Delete this function along with VFS1.
func NewSharedAnonMappable(length uint64, mfp pgalloc.MemoryFileProvider) (*SpecialMappable, error) {
if length == 0 {
return nil, syserror.EINVAL
diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go
index 4c9a575e7..a2555ba1a 100644
--- a/pkg/sentry/mm/syscalls.go
+++ b/pkg/sentry/mm/syscalls.go
@@ -24,7 +24,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -93,18 +92,6 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme
}
} else {
opts.Offset = 0
- if !opts.Private {
- if opts.MappingIdentity != nil {
- return 0, syserror.EINVAL
- }
- m, err := NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx))
- if err != nil {
- return 0, err
- }
- defer m.DecRef(ctx)
- opts.MappingIdentity = m
- opts.Mappable = m
- }
}
if opts.Addr.RoundDown() != opts.Addr {
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index 3970dd81d..323837fb1 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -9,12 +9,12 @@ go_library(
"bluepill.go",
"bluepill_allocator.go",
"bluepill_amd64.go",
- "bluepill_amd64.s",
"bluepill_amd64_unsafe.go",
"bluepill_arm64.go",
"bluepill_arm64.s",
"bluepill_arm64_unsafe.go",
"bluepill_fault.go",
+ "bluepill_impl_amd64.s",
"bluepill_unsafe.go",
"context.go",
"filters_amd64.go",
@@ -81,3 +81,11 @@ go_test(
"//pkg/usermem",
],
)
+
+genrule(
+ name = "bluepill_impl_amd64",
+ srcs = ["bluepill_amd64.s"],
+ outs = ["bluepill_impl_amd64.s"],
+ cmd = "(echo -e '// build +amd64\\n' && $(location //pkg/sentry/platform/ring0/gen_offsets) && cat $(SRCS)) > $@",
+ tools = ["//pkg/sentry/platform/ring0/gen_offsets"],
+)
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.s b/pkg/sentry/platform/kvm/bluepill_amd64.s
index 2bc34a435..025ea93b5 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64.s
+++ b/pkg/sentry/platform/kvm/bluepill_amd64.s
@@ -19,11 +19,6 @@
// This is guaranteed to be zero.
#define VCPU_CPU 0x0
-// CPU_SELF is the self reference in ring0's percpu.
-//
-// This is guaranteed to be zero.
-#define CPU_SELF 0x0
-
// Context offsets.
//
// Only limited use of the context is done in the assembly stub below, most is
@@ -44,7 +39,7 @@ begin:
LEAQ VCPU_CPU(AX), BX
BYTE CLI;
check_vcpu:
- MOVQ CPU_SELF(GS), CX
+ MOVQ ENTRY_CPU_SELF(GS), CX
CMPQ BX, CX
JE right_vCPU
wrong_vcpu:
diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go
index ae813e24e..d46946402 100644
--- a/pkg/sentry/platform/kvm/kvm.go
+++ b/pkg/sentry/platform/kvm/kvm.go
@@ -156,15 +156,7 @@ func (*KVM) MaxUserAddress() usermem.Addr {
func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan struct{}, error) {
// Allocate page tables and install system mappings.
pageTables := pagetables.New(newAllocator())
- applyPhysicalRegions(func(pr physicalRegion) bool {
- // Map the kernel in the upper half.
- pageTables.Map(
- usermem.Addr(ring0.KernelStartAddress|pr.virtual),
- pr.length,
- pagetables.MapOpts{AccessType: usermem.AnyAccess},
- pr.physical)
- return true // Keep iterating.
- })
+ k.machine.mapUpperHalf(pageTables)
// Return the new address space.
return &addressSpace{
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index 372a4cbd7..75da253c5 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -155,7 +155,7 @@ func (m *machine) newVCPU() *vCPU {
fd: int(fd),
machine: m,
}
- c.CPU.Init(&m.kernel, c)
+ c.CPU.Init(&m.kernel, c.id, c)
m.vCPUsByID[c.id] = c
// Ensure the signal mask is correct.
@@ -183,9 +183,6 @@ func newMachine(vm int) (*machine, error) {
// Create the machine.
m := &machine{fd: vm}
m.available.L = &m.mu
- m.kernel.Init(ring0.KernelOpts{
- PageTables: pagetables.New(newAllocator()),
- })
// Pull the maximum vCPUs.
maxVCPUs, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
@@ -197,6 +194,9 @@ func newMachine(vm int) (*machine, error) {
log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
m.vCPUsByTID = make(map[uint64]*vCPU)
m.vCPUsByID = make([]*vCPU, m.maxVCPUs)
+ m.kernel.Init(ring0.KernelOpts{
+ PageTables: pagetables.New(newAllocator()),
+ }, m.maxVCPUs)
// Pull the maximum slots.
maxSlots, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_MEMSLOTS)
@@ -219,15 +219,9 @@ func newMachine(vm int) (*machine, error) {
pagetables.MapOpts{AccessType: usermem.AnyAccess},
pr.physical)
- // And keep everything in the upper half.
- m.kernel.PageTables.Map(
- usermem.Addr(ring0.KernelStartAddress|pr.virtual),
- pr.length,
- pagetables.MapOpts{AccessType: usermem.AnyAccess},
- pr.physical)
-
return true // Keep iterating.
})
+ m.mapUpperHalf(m.kernel.PageTables)
var physicalRegionsReadOnly []physicalRegion
var physicalRegionsAvailable []physicalRegion
@@ -365,6 +359,11 @@ func (m *machine) Destroy() {
// Get gets an available vCPU.
//
// This will return with the OS thread locked.
+//
+// It is guaranteed that if any OS thread TID is in guest, m.vCPUs[TID] points
+// to the vCPU in which the OS thread TID is running. So if Get() returns with
+// the corrent context in guest, the vCPU of it must be the same as what
+// Get() returns.
func (m *machine) Get() *vCPU {
m.mu.RLock()
runtime.LockOSThread()
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index acc823ba6..6849ab113 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -144,6 +144,7 @@ func (c *vCPU) initArchState() error {
// Set the entrypoint for the kernel.
kernelUserRegs.RIP = uint64(reflect.ValueOf(ring0.Start).Pointer())
kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer())
+ kernelUserRegs.RSP = c.StackTop()
kernelUserRegs.RFLAGS = ring0.KernelFlagsSet
// Set the system registers.
@@ -345,3 +346,43 @@ func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
func availableRegionsForSetMem() (phyRegions []physicalRegion) {
return physicalRegions
}
+
+var execRegions []region
+
+func init() {
+ applyVirtualRegions(func(vr virtualRegion) {
+ if excludeVirtualRegion(vr) || vr.filename == "[vsyscall]" {
+ return
+ }
+
+ if vr.accessType.Execute {
+ execRegions = append(execRegions, vr.region)
+ }
+ })
+}
+
+func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
+ for _, r := range execRegions {
+ physical, length, ok := translateToPhysical(r.virtual)
+ if !ok || length < r.length {
+ panic("impossilbe translation")
+ }
+ pageTable.Map(
+ usermem.Addr(ring0.KernelStartAddress|r.virtual),
+ r.length,
+ pagetables.MapOpts{AccessType: usermem.Execute},
+ physical)
+ }
+ for start, end := range m.kernel.EntryRegions() {
+ regionLen := end - start
+ physical, length, ok := translateToPhysical(start)
+ if !ok || length < regionLen {
+ panic("impossible translation")
+ }
+ pageTable.Map(
+ usermem.Addr(ring0.KernelStartAddress|start),
+ regionLen,
+ pagetables.MapOpts{AccessType: usermem.ReadWrite},
+ physical)
+ }
+}
diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go
index 9db171af9..2df762991 100644
--- a/pkg/sentry/platform/kvm/machine_arm64.go
+++ b/pkg/sentry/platform/kvm/machine_arm64.go
@@ -19,6 +19,7 @@ package kvm
import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sentry/platform/ring0"
"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -48,6 +49,18 @@ const (
poolPCIDs = 8
)
+func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
+ applyPhysicalRegions(func(pr physicalRegion) bool {
+ pageTable.Map(
+ usermem.Addr(ring0.KernelStartAddress|pr.virtual),
+ pr.length,
+ pagetables.MapOpts{AccessType: usermem.AnyAccess},
+ pr.physical)
+
+ return true // Keep iterating.
+ })
+}
+
// Get all read-only physicalRegions.
func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
var rdonlyRegions []region
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index 905712076..537419657 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -79,7 +79,7 @@ func (c *vCPU) initArchState() error {
}
// tcr_el1
- data = _TCR_TXSZ_VA48 | _TCR_CACHE_FLAGS | _TCR_SHARED | _TCR_TG_FLAGS | _TCR_ASID16 | _TCR_IPS_40BITS
+ data = _TCR_TXSZ_VA48 | _TCR_CACHE_FLAGS | _TCR_SHARED | _TCR_TG_FLAGS | _TCR_ASID16 | _TCR_IPS_40BITS | _TCR_A1
reg.id = _KVM_ARM64_REGS_TCR_EL1
if err := c.setOneRegister(&reg); err != nil {
return err
@@ -103,7 +103,7 @@ func (c *vCPU) initArchState() error {
c.SetTtbr0Kvm(uintptr(data))
// ttbr1_el1
- data = c.machine.kernel.PageTables.TTBR1_EL1(false, 0)
+ data = c.machine.kernel.PageTables.TTBR1_EL1(false, 1)
reg.id = _KVM_ARM64_REGS_TTBR1_EL1
if err := c.setOneRegister(&reg); err != nil {
diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD
index e04165fbf..fc43cc3c0 100644
--- a/pkg/sentry/platform/ptrace/BUILD
+++ b/pkg/sentry/platform/ptrace/BUILD
@@ -30,7 +30,6 @@ go_library(
"//pkg/safecopy",
"//pkg/seccomp",
"//pkg/sentry/arch",
- "//pkg/sentry/hostcpu",
"//pkg/sentry/memmap",
"//pkg/sentry/platform",
"//pkg/sentry/platform/interrupt",
diff --git a/pkg/sentry/platform/ptrace/filters.go b/pkg/sentry/platform/ptrace/filters.go
index 1e07cfd0d..b0970e356 100644
--- a/pkg/sentry/platform/ptrace/filters.go
+++ b/pkg/sentry/platform/ptrace/filters.go
@@ -24,10 +24,9 @@ import (
// SyscallFilters returns syscalls made exclusively by the ptrace platform.
func (*PTrace) SyscallFilters() seccomp.SyscallRules {
return seccomp.SyscallRules{
- unix.SYS_GETCPU: {},
- unix.SYS_SCHED_SETAFFINITY: {},
- syscall.SYS_PTRACE: {},
- syscall.SYS_TGKILL: {},
- syscall.SYS_WAIT4: {},
+ unix.SYS_GETCPU: {},
+ syscall.SYS_PTRACE: {},
+ syscall.SYS_TGKILL: {},
+ syscall.SYS_WAIT4: {},
}
}
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index e1d54d8a2..812ab80ef 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -518,11 +518,6 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
}
defer c.interrupt.Disable()
- // Ensure that the CPU set is bound appropriately; this makes the
- // emulation below several times faster, presumably by avoiding
- // interprocessor wakeups and by simplifying the schedule.
- t.bind()
-
// Set registers.
if err := t.setRegs(regs); err != nil {
panic(fmt.Sprintf("ptrace set regs (%+v) failed: %v", regs, err))
diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go
index 84b699f0d..020bbda79 100644
--- a/pkg/sentry/platform/ptrace/subprocess_amd64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go
@@ -201,7 +201,7 @@ func appendArchSeccompRules(rules []seccomp.RuleSet, defaultAction linux.BPFActi
seccomp.RuleSet{
Rules: seccomp.SyscallRules{
syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
- {seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)},
+ {seccomp.EqualTo(linux.ARCH_SET_CPUID), seccomp.EqualTo(0)},
},
},
Action: linux.SECCOMP_RET_ALLOW,
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index 2ce528601..8548853da 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -80,9 +80,9 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
Rules: seccomp.SyscallRules{
syscall.SYS_CLONE: []seccomp.Rule{
// Allow creation of new subprocesses (used by the master).
- {seccomp.AllowValue(syscall.CLONE_FILES | syscall.SIGKILL)},
+ {seccomp.EqualTo(syscall.CLONE_FILES | syscall.SIGKILL)},
// Allow creation of new threads within a single address space (used by addresss spaces).
- {seccomp.AllowValue(
+ {seccomp.EqualTo(
syscall.CLONE_FILES |
syscall.CLONE_FS |
syscall.CLONE_SIGHAND |
@@ -97,14 +97,14 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
// For the stub prctl dance (all).
syscall.SYS_PRCTL: []seccomp.Rule{
- {seccomp.AllowValue(syscall.PR_SET_PDEATHSIG), seccomp.AllowValue(syscall.SIGKILL)},
+ {seccomp.EqualTo(syscall.PR_SET_PDEATHSIG), seccomp.EqualTo(syscall.SIGKILL)},
},
syscall.SYS_GETPPID: {},
// For the stub to stop itself (all).
syscall.SYS_GETPID: {},
syscall.SYS_KILL: []seccomp.Rule{
- {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SIGSTOP)},
+ {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SIGSTOP)},
},
// Injected to support the address space operations.
@@ -115,7 +115,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
})
}
rules = appendArchSeccompRules(rules, defaultAction)
- instrs, err := seccomp.BuildProgram(rules, defaultAction)
+ instrs, err := seccomp.BuildProgram(rules, defaultAction, defaultAction)
if err != nil {
return nil, err
}
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
index 245b20722..533e45497 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
@@ -18,29 +18,12 @@
package ptrace
import (
- "sync/atomic"
"syscall"
"unsafe"
- "golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/hostcpu"
- "gvisor.dev/gvisor/pkg/sync"
)
-// maskPool contains reusable CPU masks for setting affinity. Unfortunately,
-// runtime.NumCPU doesn't actually record the number of CPUs on the system, it
-// just records the number of CPUs available in the scheduler affinity set at
-// startup. This may a) change over time and b) gives a number far lower than
-// the maximum indexable CPU. To prevent lots of allocation in the hot path, we
-// use a pool to store large masks that we can reuse during bind.
-var maskPool = sync.Pool{
- New: func() interface{} {
- const maxCPUs = 1024 // Not a hard limit; see below.
- return make([]uintptr, maxCPUs/64)
- },
-}
-
// unmaskAllSignals unmasks all signals on the current thread.
//
//go:nosplit
@@ -49,47 +32,3 @@ func unmaskAllSignals() syscall.Errno {
_, _, errno := syscall.RawSyscall6(syscall.SYS_RT_SIGPROCMASK, linux.SIG_SETMASK, uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0)
return errno
}
-
-// setCPU sets the CPU affinity.
-func (t *thread) setCPU(cpu uint32) error {
- mask := maskPool.Get().([]uintptr)
- n := int(cpu / 64)
- v := uintptr(1 << uintptr(cpu%64))
- if n >= len(mask) {
- // See maskPool note above. We've actually exceeded the number
- // of available cores. Grow the mask and return it.
- mask = make([]uintptr, n+1)
- }
- mask[n] |= v
- if _, _, errno := syscall.RawSyscall(
- unix.SYS_SCHED_SETAFFINITY,
- uintptr(t.tid),
- uintptr(len(mask)*8),
- uintptr(unsafe.Pointer(&mask[0]))); errno != 0 {
- return errno
- }
- mask[n] &^= v
- maskPool.Put(mask)
- return nil
-}
-
-// bind attempts to ensure that the thread is on the same CPU as the current
-// thread. This provides no guarantees as it is fundamentally a racy operation:
-// CPU sets may change and we may be rescheduled in the middle of this
-// operation. As a result, no failures are reported.
-//
-// Precondition: the current runtime thread should be locked.
-func (t *thread) bind() {
- currentCPU := hostcpu.GetCPU()
-
- if oldCPU := atomic.SwapUint32(&t.cpu, currentCPU); oldCPU != currentCPU {
- // Set the affinity on the thread and save the CPU for next
- // round; we don't expect CPUs to bounce around too frequently.
- //
- // (It's worth noting that we could move CPUs between this point
- // and when the tracee finishes executing. But that would be
- // roughly the status quo anyways -- we're just maximizing our
- // chances of colocation, not guaranteeing it.)
- t.setCPU(currentCPU)
- }
-}
diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go
index 9c6c2cf5c..f617519fa 100644
--- a/pkg/sentry/platform/ring0/defs_amd64.go
+++ b/pkg/sentry/platform/ring0/defs_amd64.go
@@ -76,15 +76,42 @@ type KernelOpts struct {
type KernelArchState struct {
KernelOpts
+ // cpuEntries is array of kernelEntry for all cpus
+ cpuEntries []kernelEntry
+
// globalIDT is our set of interrupt gates.
- globalIDT idt64
+ globalIDT *idt64
}
-// CPUArchState contains CPU-specific arch state.
-type CPUArchState struct {
+// kernelEntry contains minimal CPU-specific arch state
+// that can be mapped at the upper of the address space.
+// Malicious APP might steal info from it via CPU bugs.
+type kernelEntry struct {
// stack is the stack used for interrupts on this CPU.
stack [256]byte
+ // scratch space for temporary usage.
+ scratch0 uint64
+ scratch1 uint64
+
+ // stackTop is the top of the stack.
+ stackTop uint64
+
+ // cpuSelf is back reference to CPU.
+ cpuSelf *CPU
+
+ // kernelCR3 is the cr3 used for sentry kernel.
+ kernelCR3 uintptr
+
+ // gdt is the CPU's descriptor table.
+ gdt descriptorTable
+
+ // tss is the CPU's task state.
+ tss TaskState64
+}
+
+// CPUArchState contains CPU-specific arch state.
+type CPUArchState struct {
// errorCode is the error code from the last exception.
errorCode uintptr
@@ -97,11 +124,7 @@ type CPUArchState struct {
// exception.
errorType uintptr
- // gdt is the CPU's descriptor table.
- gdt descriptorTable
-
- // tss is the CPU's task state.
- tss TaskState64
+ *kernelEntry
}
// ErrorCode returns the last error code.
diff --git a/pkg/sentry/platform/ring0/defs_arm64.go b/pkg/sentry/platform/ring0/defs_arm64.go
index 0e2ab716c..508236e46 100644
--- a/pkg/sentry/platform/ring0/defs_arm64.go
+++ b/pkg/sentry/platform/ring0/defs_arm64.go
@@ -77,6 +77,9 @@ type CPUArchState struct {
// lazyVFP is the value of cpacr_el1.
lazyVFP uintptr
+
+ // appASID is the asid value of guest application.
+ appASID uintptr
}
// ErrorCode returns the last error code.
diff --git a/pkg/sentry/platform/ring0/entry_amd64.go b/pkg/sentry/platform/ring0/entry_amd64.go
index 7fa43c2f5..d87b1fd00 100644
--- a/pkg/sentry/platform/ring0/entry_amd64.go
+++ b/pkg/sentry/platform/ring0/entry_amd64.go
@@ -36,12 +36,15 @@ func sysenter()
// This must be called prior to sysret/iret.
func swapgs()
+// jumpToKernel jumps to the kernel version of the current RIP.
+func jumpToKernel()
+
// sysret returns to userspace from a system call.
//
// The return code is the vector that interrupted execution.
//
// See stubs.go for a note regarding the frame size of this function.
-func sysret(*CPU, *arch.Registers) Vector
+func sysret(cpu *CPU, regs *arch.Registers, userCR3 uintptr) Vector
// "iret is the cadillac of CPL switching."
//
@@ -50,7 +53,7 @@ func sysret(*CPU, *arch.Registers) Vector
// iret is nearly identical to sysret, except an iret is used to fully restore
// all user state. This must be called in cases where all registers need to be
// restored.
-func iret(*CPU, *arch.Registers) Vector
+func iret(cpu *CPU, regs *arch.Registers, userCR3 uintptr) Vector
// exception is the generic exception entry.
//
diff --git a/pkg/sentry/platform/ring0/entry_amd64.s b/pkg/sentry/platform/ring0/entry_amd64.s
index 02df38331..82689a194 100644
--- a/pkg/sentry/platform/ring0/entry_amd64.s
+++ b/pkg/sentry/platform/ring0/entry_amd64.s
@@ -63,6 +63,15 @@
MOVQ offset+PTRACE_RSI(reg), SI; \
MOVQ offset+PTRACE_RDI(reg), DI;
+// WRITE_CR3() writes the given CR3 value.
+//
+// The code corresponds to:
+//
+// mov %rax, %cr3
+//
+#define WRITE_CR3() \
+ BYTE $0x0f; BYTE $0x22; BYTE $0xd8;
+
// SWAP_GS swaps the kernel GS (CPU).
#define SWAP_GS() \
BYTE $0x0F; BYTE $0x01; BYTE $0xf8;
@@ -75,15 +84,9 @@
#define SYSRET64() \
BYTE $0x48; BYTE $0x0f; BYTE $0x07;
-// LOAD_KERNEL_ADDRESS loads a kernel address.
-#define LOAD_KERNEL_ADDRESS(from, to) \
- MOVQ from, to; \
- ORQ ·KernelStartAddress(SB), to;
-
// LOAD_KERNEL_STACK loads the kernel stack.
-#define LOAD_KERNEL_STACK(from) \
- LOAD_KERNEL_ADDRESS(CPU_SELF(from), SP); \
- LEAQ CPU_STACK_TOP(SP), SP;
+#define LOAD_KERNEL_STACK(entry) \
+ MOVQ ENTRY_STACK_TOP(entry), SP;
// See kernel.go.
TEXT ·Halt(SB),NOSPLIT,$0
@@ -95,58 +98,93 @@ TEXT ·swapgs(SB),NOSPLIT,$0
SWAP_GS()
RET
+// jumpToKernel changes execution to the kernel address space.
+//
+// This works by changing the return value to the kernel version.
+TEXT ·jumpToKernel(SB),NOSPLIT,$0
+ MOVQ 0(SP), AX
+ ORQ ·KernelStartAddress(SB), AX // Future return value.
+ MOVQ AX, 0(SP)
+ RET
+
// See entry_amd64.go.
TEXT ·sysret(SB),NOSPLIT,$0-24
- // Save original state.
- LOAD_KERNEL_ADDRESS(cpu+0(FP), BX)
- LOAD_KERNEL_ADDRESS(regs+8(FP), AX)
+ CALL ·jumpToKernel(SB)
+ // Save original state and stack. sysenter() or exception()
+ // from APP(gr3) will switch to this stack, set the return
+ // value (vector: 32(SP)) and then do RET, which will also
+ // automatically return to the lower half.
+ MOVQ cpu+0(FP), BX
+ MOVQ regs+8(FP), AX
+ MOVQ userCR3+16(FP), CX
MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX)
MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX)
MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX)
+ // save SP AX userCR3 on the kernel stack.
+ MOVQ CPU_ENTRY(BX), BX
+ LOAD_KERNEL_STACK(BX)
+ PUSHQ PTRACE_RSP(AX)
+ PUSHQ PTRACE_RAX(AX)
+ PUSHQ CX
+
// Restore user register state.
REGISTERS_LOAD(AX, 0)
MOVQ PTRACE_RIP(AX), CX // Needed for SYSRET.
MOVQ PTRACE_FLAGS(AX), R11 // Needed for SYSRET.
- MOVQ PTRACE_RSP(AX), SP // Restore the stack directly.
- MOVQ PTRACE_RAX(AX), AX // Restore AX (scratch).
+
+ // restore userCR3, AX, SP.
+ POPQ AX // Get userCR3.
+ WRITE_CR3() // Switch to userCR3.
+ POPQ AX // Restore AX.
+ POPQ SP // Restore SP.
SYSRET64()
// See entry_amd64.go.
TEXT ·iret(SB),NOSPLIT,$0-24
- // Save original state.
- LOAD_KERNEL_ADDRESS(cpu+0(FP), BX)
- LOAD_KERNEL_ADDRESS(regs+8(FP), AX)
+ CALL ·jumpToKernel(SB)
+ // Save original state and stack. sysenter() or exception()
+ // from APP(gr3) will switch to this stack, set the return
+ // value (vector: 32(SP)) and then do RET, which will also
+ // automatically return to the lower half.
+ MOVQ cpu+0(FP), BX
+ MOVQ regs+8(FP), AX
+ MOVQ userCR3+16(FP), CX
MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX)
MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX)
MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX)
// Build an IRET frame & restore state.
+ MOVQ CPU_ENTRY(BX), BX
LOAD_KERNEL_STACK(BX)
- MOVQ PTRACE_SS(AX), BX; PUSHQ BX
- MOVQ PTRACE_RSP(AX), CX; PUSHQ CX
- MOVQ PTRACE_FLAGS(AX), DX; PUSHQ DX
- MOVQ PTRACE_CS(AX), DI; PUSHQ DI
- MOVQ PTRACE_RIP(AX), SI; PUSHQ SI
- REGISTERS_LOAD(AX, 0) // Restore most registers.
- MOVQ PTRACE_RAX(AX), AX // Restore AX (scratch).
+ PUSHQ PTRACE_SS(AX)
+ PUSHQ PTRACE_RSP(AX)
+ PUSHQ PTRACE_FLAGS(AX)
+ PUSHQ PTRACE_CS(AX)
+ PUSHQ PTRACE_RIP(AX)
+ PUSHQ PTRACE_RAX(AX) // Save AX on kernel stack.
+ PUSHQ CX // Save userCR3 on kernel stack.
+ REGISTERS_LOAD(AX, 0) // Restore most registers.
+ POPQ AX // Get userCR3.
+ WRITE_CR3() // Switch to userCR3.
+ POPQ AX // Restore AX.
IRET()
// See entry_amd64.go.
TEXT ·resume(SB),NOSPLIT,$0
// See iret, above.
- MOVQ CPU_REGISTERS+PTRACE_SS(GS), BX; PUSHQ BX
- MOVQ CPU_REGISTERS+PTRACE_RSP(GS), CX; PUSHQ CX
- MOVQ CPU_REGISTERS+PTRACE_FLAGS(GS), DX; PUSHQ DX
- MOVQ CPU_REGISTERS+PTRACE_CS(GS), DI; PUSHQ DI
- MOVQ CPU_REGISTERS+PTRACE_RIP(GS), SI; PUSHQ SI
- REGISTERS_LOAD(GS, CPU_REGISTERS)
- MOVQ CPU_REGISTERS+PTRACE_RAX(GS), AX
+ MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
+ PUSHQ CPU_REGISTERS+PTRACE_SS(AX)
+ PUSHQ CPU_REGISTERS+PTRACE_RSP(AX)
+ PUSHQ CPU_REGISTERS+PTRACE_FLAGS(AX)
+ PUSHQ CPU_REGISTERS+PTRACE_CS(AX)
+ PUSHQ CPU_REGISTERS+PTRACE_RIP(AX)
+ REGISTERS_LOAD(AX, CPU_REGISTERS)
+ MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX
IRET()
// See entry_amd64.go.
TEXT ·Start(SB),NOSPLIT,$0
- LOAD_KERNEL_STACK(AX) // Set the stack.
PUSHQ $0x0 // Previous frame pointer.
MOVQ SP, BP // Set frame pointer.
PUSHQ AX // First argument (CPU).
@@ -164,44 +202,54 @@ TEXT ·sysenter(SB),NOSPLIT,$0
user:
SWAP_GS()
- XCHGQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Swap stacks.
- XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for AX (regs).
+ MOVQ AX, ENTRY_SCRATCH0(GS) // Save user AX on scratch.
+ MOVQ ENTRY_KERNEL_CR3(GS), AX // Get kernel cr3 on AX.
+ WRITE_CR3() // Switch to kernel cr3.
+
+ MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
+ MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX // Get user regs.
REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX.
- MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX // Load saved AX value.
- MOVQ BX, PTRACE_RAX(AX) // Save everything else.
- MOVQ BX, PTRACE_ORIGRAX(AX)
MOVQ CX, PTRACE_RIP(AX)
MOVQ R11, PTRACE_FLAGS(AX)
- MOVQ CPU_REGISTERS+PTRACE_RSP(GS), BX; MOVQ BX, PTRACE_RSP(AX)
- MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code.
- MOVQ $1, CPU_ERROR_TYPE(GS) // Set error type to user.
+ MOVQ SP, PTRACE_RSP(AX)
+ MOVQ ENTRY_SCRATCH0(GS), CX // Load saved user AX value.
+ MOVQ CX, PTRACE_RAX(AX) // Save everything else.
+ MOVQ CX, PTRACE_ORIGRAX(AX)
+
+ MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
+ MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP // Get stacks.
+ MOVQ $0, CPU_ERROR_CODE(AX) // Clear error code.
+ MOVQ $1, CPU_ERROR_TYPE(AX) // Set error type to user.
// Return to the kernel, where the frame is:
//
- // vector (sp+24)
+ // vector (sp+32)
+ // userCR3 (sp+24)
// regs (sp+16)
// cpu (sp+8)
// vcpu.Switch (sp+0)
//
- MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer.
- MOVQ $Syscall, 24(SP) // Output vector.
+ MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer.
+ MOVQ $Syscall, 32(SP) // Output vector.
RET
kernel:
// We can't restore the original stack, but we can access the registers
// in the CPU state directly. No need for temporary juggling.
- MOVQ AX, CPU_REGISTERS+PTRACE_ORIGRAX(GS)
- MOVQ AX, CPU_REGISTERS+PTRACE_RAX(GS)
- REGISTERS_SAVE(GS, CPU_REGISTERS)
- MOVQ CX, CPU_REGISTERS+PTRACE_RIP(GS)
- MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(GS)
- MOVQ SP, CPU_REGISTERS+PTRACE_RSP(GS)
- MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code.
- MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel.
+ MOVQ AX, ENTRY_SCRATCH0(GS)
+ MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
+ REGISTERS_SAVE(AX, CPU_REGISTERS)
+ MOVQ CX, CPU_REGISTERS+PTRACE_RIP(AX)
+ MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(AX)
+ MOVQ SP, CPU_REGISTERS+PTRACE_RSP(AX)
+ MOVQ ENTRY_SCRATCH0(GS), BX
+ MOVQ BX, CPU_REGISTERS+PTRACE_ORIGRAX(AX)
+ MOVQ BX, CPU_REGISTERS+PTRACE_RAX(AX)
+ MOVQ $0, CPU_ERROR_CODE(AX) // Clear error code.
+ MOVQ $0, CPU_ERROR_TYPE(AX) // Set error type to kernel.
// Call the syscall trampoline.
LOAD_KERNEL_STACK(GS)
- MOVQ CPU_SELF(GS), AX // Load vCPU.
PUSHQ AX // First argument (vCPU).
CALL ·kernelSyscall(SB) // Call the trampoline.
POPQ AX // Pop vCPU.
@@ -237,9 +285,14 @@ user:
SWAP_GS()
ADDQ $-8, SP // Adjust for flags.
MOVQ $_KERNEL_FLAGS, 0(SP); BYTE $0x9d; // Reset flags (POPFQ).
- XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for user regs.
+ PUSHQ AX // Save user AX on stack.
+ MOVQ ENTRY_KERNEL_CR3(GS), AX // Get kernel cr3 on AX.
+ WRITE_CR3() // Switch to kernel cr3.
+
+ MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
+ MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX // Get user regs.
REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX.
- MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX // Restore original AX.
+ POPQ BX // Restore original AX.
MOVQ BX, PTRACE_RAX(AX) // Save it.
MOVQ BX, PTRACE_ORIGRAX(AX)
MOVQ 16(SP), BX; MOVQ BX, PTRACE_RIP(AX)
@@ -249,34 +302,36 @@ user:
MOVQ 48(SP), SI; MOVQ SI, PTRACE_SS(AX)
// Copy out and return.
+ MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
MOVQ 0(SP), BX // Load vector.
MOVQ 8(SP), CX // Load error code.
- MOVQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Original stack (kernel version).
- MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer.
- MOVQ CX, CPU_ERROR_CODE(GS) // Set error code.
- MOVQ $1, CPU_ERROR_TYPE(GS) // Set error type to user.
- MOVQ BX, 24(SP) // Output vector.
+ MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP // Original stack (kernel version).
+ MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer.
+ MOVQ CX, CPU_ERROR_CODE(AX) // Set error code.
+ MOVQ $1, CPU_ERROR_TYPE(AX) // Set error type to user.
+ MOVQ BX, 32(SP) // Output vector.
RET
kernel:
// As per above, we can save directly.
- MOVQ AX, CPU_REGISTERS+PTRACE_RAX(GS)
- MOVQ AX, CPU_REGISTERS+PTRACE_ORIGRAX(GS)
- REGISTERS_SAVE(GS, CPU_REGISTERS)
- MOVQ 16(SP), AX; MOVQ AX, CPU_REGISTERS+PTRACE_RIP(GS)
- MOVQ 32(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_FLAGS(GS)
- MOVQ 40(SP), CX; MOVQ CX, CPU_REGISTERS+PTRACE_RSP(GS)
+ PUSHQ AX
+ MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
+ REGISTERS_SAVE(AX, CPU_REGISTERS)
+ POPQ BX
+ MOVQ BX, CPU_REGISTERS+PTRACE_RAX(AX)
+ MOVQ BX, CPU_REGISTERS+PTRACE_ORIGRAX(AX)
+ MOVQ 16(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_RIP(AX)
+ MOVQ 32(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_FLAGS(AX)
+ MOVQ 40(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_RSP(AX)
// Set the error code and adjust the stack.
- MOVQ 8(SP), AX // Load the error code.
- MOVQ AX, CPU_ERROR_CODE(GS) // Copy out to the CPU.
- MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel.
+ MOVQ 8(SP), BX // Load the error code.
+ MOVQ BX, CPU_ERROR_CODE(AX) // Copy out to the CPU.
+ MOVQ $0, CPU_ERROR_TYPE(AX) // Set error type to kernel.
MOVQ 0(SP), BX // BX contains the vector.
- ADDQ $48, SP // Drop the exception frame.
// Call the exception trampoline.
LOAD_KERNEL_STACK(GS)
- MOVQ CPU_SELF(GS), AX // Load vCPU.
PUSHQ BX // Second argument (vector).
PUSHQ AX // First argument (vCPU).
CALL ·kernelException(SB) // Call the trampoline.
diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index 9d29b7168..5f63cbd45 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -27,7 +27,9 @@
// ERET returns using the ELR and SPSR for the current exception level.
#define ERET() \
- WORD $0xd69f03e0
+ WORD $0xd69f03e0; \
+ DSB $7; \
+ ISB $15;
// RSV_REG is a register that holds el1 information temporarily.
#define RSV_REG R18_PLATFORM
@@ -300,17 +302,23 @@
// SWITCH_TO_APP_PAGETABLE sets a new pagetable for a container application.
#define SWITCH_TO_APP_PAGETABLE(from) \
- MOVD CPU_TTBR0_APP(from), RSV_REG; \
- WORD $0xd5182012; \ // MSR R18, TTBR0_EL1
+ MRS TTBR1_EL1, R0; \
+ MOVD CPU_APP_ASID(from), R1; \
+ BFI $48, R1, $16, R0; \
+ MSR R0, TTBR1_EL1; \ // set the ASID in TTBR1_EL1 (since TCR.A1 is set)
ISB $15; \
- DSB $15;
+ MOVD CPU_TTBR0_APP(from), RSV_REG; \
+ MSR RSV_REG, TTBR0_EL1;
// SWITCH_TO_KVM_PAGETABLE sets the kvm pagetable.
#define SWITCH_TO_KVM_PAGETABLE(from) \
- MOVD CPU_TTBR0_KVM(from), RSV_REG; \
- WORD $0xd5182012; \ // MSR R18, TTBR0_EL1
+ MRS TTBR1_EL1, R0; \
+ MOVD $1, R1; \
+ BFI $48, R1, $16, R0; \
+ MSR R0, TTBR1_EL1; \
ISB $15; \
- DSB $15;
+ MOVD CPU_TTBR0_KVM(from), RSV_REG; \
+ MSR RSV_REG, TTBR0_EL1;
#define VFP_ENABLE \
MOVD $FPEN_ENABLE, R0; \
@@ -326,23 +334,20 @@
#define KERNEL_ENTRY_FROM_EL0 \
SUB $16, RSP, RSP; \ // step1, save r18, r9 into kernel temporary stack.
STP (RSV_REG, RSV_REG_APP), 16*0(RSP); \
- WORD $0xd538d092; \ //MRS TPIDR_EL1, R18, step2, switch user pagetable.
- SWITCH_TO_KVM_PAGETABLE(RSV_REG); \
- WORD $0xd538d092; \ //MRS TPIDR_EL1, R18
- MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP; \ // step3, load app context pointer.
- REGISTERS_SAVE(RSV_REG_APP, 0); \ // step4, save app context.
+ WORD $0xd538d092; \ // MRS TPIDR_EL1, R18
+ MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP; \ // step2, load app context pointer.
+ REGISTERS_SAVE(RSV_REG_APP, 0); \ // step3, save app context.
MOVD RSV_REG_APP, R20; \
LDP 16*0(RSP), (RSV_REG, RSV_REG_APP); \
ADD $16, RSP, RSP; \
MOVD RSV_REG, PTRACE_R18(R20); \
MOVD RSV_REG_APP, PTRACE_R9(R20); \
- MOVD R20, RSV_REG_APP; \
WORD $0xd5384003; \ // MRS SPSR_EL1, R3
- MOVD R3, PTRACE_PSTATE(RSV_REG_APP); \
+ MOVD R3, PTRACE_PSTATE(R20); \
MRS ELR_EL1, R3; \
- MOVD R3, PTRACE_PC(RSV_REG_APP); \
+ MOVD R3, PTRACE_PC(R20); \
WORD $0xd5384103; \ // MRS SP_EL0, R3
- MOVD R3, PTRACE_SP(RSV_REG_APP);
+ MOVD R3, PTRACE_SP(R20);
// KERNEL_ENTRY_FROM_EL1 is the entry code of the vcpu from el1 to el1.
#define KERNEL_ENTRY_FROM_EL1 \
@@ -357,6 +362,13 @@
MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG); \
LOAD_KERNEL_STACK(RSV_REG); // Load the temporary stack.
+// storeAppASID writes the application's asid value.
+TEXT ·storeAppASID(SB),NOSPLIT,$0-8
+ MOVD asid+0(FP), R1
+ MRS TPIDR_EL1, RSV_REG
+ MOVD R1, CPU_APP_ASID(RSV_REG)
+ RET
+
// Halt halts execution.
TEXT ·Halt(SB),NOSPLIT,$0
// Clear bluepill.
@@ -414,7 +426,7 @@ TEXT ·Current(SB),NOSPLIT,$0-8
MOVD R8, ret+0(FP)
RET
-#define STACK_FRAME_SIZE 16
+#define STACK_FRAME_SIZE 32
// kernelExitToEl0 is the entrypoint for application in guest_el0.
// Prepare the vcpu environment for container application.
@@ -458,15 +470,16 @@ TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
SUB $STACK_FRAME_SIZE, RSP, RSP
STP (RSV_REG, RSV_REG_APP), 16*0(RSP)
+ STP (R0, R1), 16*1(RSP)
WORD $0xd538d092 //MRS TPIDR_EL1, R18
SWITCH_TO_APP_PAGETABLE(RSV_REG)
+ LDP 16*1(RSP), (R0, R1)
LDP 16*0(RSP), (RSV_REG, RSV_REG_APP)
ADD $STACK_FRAME_SIZE, RSP, RSP
- ISB $15
ERET()
// kernelExitToEl1 is the entrypoint for sentry in guest_el1.
@@ -482,6 +495,9 @@ TEXT ·kernelExitToEl1(SB),NOSPLIT,$0
MOVD CPU_REGISTERS+PTRACE_SP(RSV_REG), R1
MOVD R1, RSP
+ SWITCH_TO_KVM_PAGETABLE(RSV_REG)
+ MRS TPIDR_EL1, RSV_REG
+
REGISTERS_LOAD(RSV_REG, CPU_REGISTERS)
MOVD CPU_REGISTERS+PTRACE_R9(RSV_REG), RSV_REG_APP
diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD
index 549f3d228..9742308d8 100644
--- a/pkg/sentry/platform/ring0/gen_offsets/BUILD
+++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD
@@ -24,7 +24,10 @@ go_binary(
"defs_impl_arm64.go",
"main.go",
],
- visibility = ["//pkg/sentry/platform/ring0:__pkg__"],
+ visibility = [
+ "//pkg/sentry/platform/kvm:__pkg__",
+ "//pkg/sentry/platform/ring0:__pkg__",
+ ],
deps = [
"//pkg/cpuid",
"//pkg/sentry/arch",
diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go
index 021693791..264be23d3 100644
--- a/pkg/sentry/platform/ring0/kernel.go
+++ b/pkg/sentry/platform/ring0/kernel.go
@@ -19,8 +19,8 @@ package ring0
// N.B. that constraints on KernelOpts must be satisfied.
//
//go:nosplit
-func (k *Kernel) Init(opts KernelOpts) {
- k.init(opts)
+func (k *Kernel) Init(opts KernelOpts, maxCPUs int) {
+ k.init(opts, maxCPUs)
}
// Halt halts execution.
@@ -49,6 +49,11 @@ func (defaultHooks) KernelException(Vector) {
// kernelSyscall is a trampoline.
//
+// When in amd64, it is called with %rip on the upper half, so it can
+// NOT access to any global data which is not mapped on upper and must
+// call to function pointers or interfaces to switch to the lower half
+// so that callee can access to global data.
+//
// +checkescape:hard,stack
//
//go:nosplit
@@ -58,6 +63,11 @@ func kernelSyscall(c *CPU) {
// kernelException is a trampoline.
//
+// When in amd64, it is called with %rip on the upper half, so it can
+// NOT access to any global data which is not mapped on upper and must
+// call to function pointers or interfaces to switch to the lower half
+// so that callee can access to global data.
+//
// +checkescape:hard,stack
//
//go:nosplit
@@ -68,10 +78,10 @@ func kernelException(c *CPU, vector Vector) {
// Init initializes a new CPU.
//
// Init allows embedding in other objects.
-func (c *CPU) Init(k *Kernel, hooks Hooks) {
- c.self = c // Set self reference.
- c.kernel = k // Set kernel reference.
- c.init() // Perform architectural init.
+func (c *CPU) Init(k *Kernel, cpuID int, hooks Hooks) {
+ c.self = c // Set self reference.
+ c.kernel = k // Set kernel reference.
+ c.init(cpuID) // Perform architectural init.
// Require hooks.
if hooks != nil {
diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go
index d37981dbf..3a9dff4cc 100644
--- a/pkg/sentry/platform/ring0/kernel_amd64.go
+++ b/pkg/sentry/platform/ring0/kernel_amd64.go
@@ -18,13 +18,42 @@ package ring0
import (
"encoding/binary"
+ "reflect"
+
+ "gvisor.dev/gvisor/pkg/usermem"
)
// init initializes architecture-specific state.
-func (k *Kernel) init(opts KernelOpts) {
+func (k *Kernel) init(opts KernelOpts, maxCPUs int) {
// Save the root page tables.
k.PageTables = opts.PageTables
+ entrySize := reflect.TypeOf(kernelEntry{}).Size()
+ var (
+ entries []kernelEntry
+ padding = 1
+ )
+ for {
+ entries = make([]kernelEntry, maxCPUs+padding-1)
+ totalSize := entrySize * uintptr(maxCPUs+padding-1)
+ addr := reflect.ValueOf(&entries[0]).Pointer()
+ if addr&(usermem.PageSize-1) == 0 && totalSize >= usermem.PageSize {
+ // The runtime forces power-of-2 alignment for allocations, and we are therefore
+ // safe once the first address is aligned and the chunk is at least a full page.
+ break
+ }
+ padding = padding << 1
+ }
+ k.cpuEntries = entries
+
+ k.globalIDT = &idt64{}
+ if reflect.TypeOf(idt64{}).Size() != usermem.PageSize {
+ panic("Size of globalIDT should be PageSize")
+ }
+ if reflect.ValueOf(k.globalIDT).Pointer()&(usermem.PageSize-1) != 0 {
+ panic("Allocated globalIDT should be page aligned")
+ }
+
// Setup the IDT, which is uniform.
for v, handler := range handlers {
// Allow Breakpoint and Overflow to be called from all
@@ -39,8 +68,26 @@ func (k *Kernel) init(opts KernelOpts) {
}
}
+func (k *Kernel) EntryRegions() map[uintptr]uintptr {
+ regions := make(map[uintptr]uintptr)
+
+ addr := reflect.ValueOf(&k.cpuEntries[0]).Pointer()
+ size := reflect.TypeOf(kernelEntry{}).Size() * uintptr(len(k.cpuEntries))
+ end, _ := usermem.Addr(addr + size).RoundUp()
+ regions[uintptr(usermem.Addr(addr).RoundDown())] = uintptr(end)
+
+ addr = reflect.ValueOf(k.globalIDT).Pointer()
+ size = reflect.TypeOf(idt64{}).Size()
+ end, _ = usermem.Addr(addr + size).RoundUp()
+ regions[uintptr(usermem.Addr(addr).RoundDown())] = uintptr(end)
+
+ return regions
+}
+
// init initializes architecture-specific state.
-func (c *CPU) init() {
+func (c *CPU) init(cpuID int) {
+ c.kernelEntry = &c.kernel.cpuEntries[cpuID]
+ c.cpuSelf = c
// Null segment.
c.gdt[0].setNull()
@@ -65,6 +112,7 @@ func (c *CPU) init() {
// Set the kernel stack pointer in the TSS (virtual address).
stackAddr := c.StackTop()
+ c.stackTop = stackAddr
c.tss.rsp0Lo = uint32(stackAddr)
c.tss.rsp0Hi = uint32(stackAddr >> 32)
c.tss.ist1Lo = uint32(stackAddr)
@@ -183,7 +231,7 @@ func IsCanonical(addr uint64) bool {
//go:nosplit
func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID)
- kernelCR3 := c.kernel.PageTables.CR3(true, switchOpts.KernelPCID)
+ c.kernelCR3 = uintptr(c.kernel.PageTables.CR3(true, switchOpts.KernelPCID))
// Sanitize registers.
regs := switchOpts.Registers
@@ -197,15 +245,11 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
WriteFS(uintptr(regs.Fs_base)) // escapes: no. Set application FS.
WriteGS(uintptr(regs.Gs_base)) // escapes: no. Set application GS.
LoadFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy in floating point.
- jumpToKernel() // Switch to upper half.
- writeCR3(uintptr(userCR3)) // Change to user address space.
if switchOpts.FullRestore {
- vector = iret(c, regs)
+ vector = iret(c, regs, uintptr(userCR3))
} else {
- vector = sysret(c, regs)
+ vector = sysret(c, regs, uintptr(userCR3))
}
- writeCR3(uintptr(kernelCR3)) // Return to kernel address space.
- jumpToUser() // Return to lower half.
SaveFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy out floating point.
WriteFS(uintptr(c.registers.Fs_base)) // escapes: no. Restore kernel FS.
return
@@ -219,7 +263,7 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
//go:nosplit
func start(c *CPU) {
// Save per-cpu & FS segment.
- WriteGS(kernelAddr(c))
+ WriteGS(kernelAddr(c.kernelEntry))
WriteFS(uintptr(c.registers.Fs_base))
// Initialize floating point.
diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go
index d0afa1aaa..0ca98a7c7 100644
--- a/pkg/sentry/platform/ring0/kernel_arm64.go
+++ b/pkg/sentry/platform/ring0/kernel_arm64.go
@@ -25,13 +25,13 @@ func HaltAndResume()
func HaltEl1SvcAndResume()
// init initializes architecture-specific state.
-func (k *Kernel) init(opts KernelOpts) {
+func (k *Kernel) init(opts KernelOpts, maxCPUs int) {
// Save the root page tables.
k.PageTables = opts.PageTables
}
// init initializes architecture-specific state.
-func (c *CPU) init() {
+func (c *CPU) init(cpuID int) {
// Set the kernel stack pointer(virtual address).
c.registers.Sp = uint64(c.StackTop())
@@ -53,6 +53,11 @@ func IsCanonical(addr uint64) bool {
//go:nosplit
func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
+ storeAppASID(uintptr(switchOpts.UserASID))
+ if switchOpts.Flush {
+ FlushTlbAll()
+ }
+
regs := switchOpts.Registers
regs.Pstate &= ^uint64(PsrFlagsClear)
diff --git a/pkg/sentry/platform/ring0/lib_amd64.go b/pkg/sentry/platform/ring0/lib_amd64.go
index ca968a036..0ec5c3bc5 100644
--- a/pkg/sentry/platform/ring0/lib_amd64.go
+++ b/pkg/sentry/platform/ring0/lib_amd64.go
@@ -61,21 +61,9 @@ func wrgsbase(addr uintptr)
// wrgsmsr writes to the GS_BASE MSR.
func wrgsmsr(addr uintptr)
-// writeCR3 writes the CR3 value.
-func writeCR3(phys uintptr)
-
-// readCR3 reads the current CR3 value.
-func readCR3() uintptr
-
// readCR2 reads the current CR2 value.
func readCR2() uintptr
-// jumpToKernel jumps to the kernel version of the current RIP.
-func jumpToKernel()
-
-// jumpToUser jumps to the user version of the current RIP.
-func jumpToUser()
-
// fninit initializes the floating point unit.
func fninit()
diff --git a/pkg/sentry/platform/ring0/lib_amd64.s b/pkg/sentry/platform/ring0/lib_amd64.s
index 75d742750..2fe83568a 100644
--- a/pkg/sentry/platform/ring0/lib_amd64.s
+++ b/pkg/sentry/platform/ring0/lib_amd64.s
@@ -127,53 +127,6 @@ TEXT ·wrgsmsr(SB),NOSPLIT,$0-8
BYTE $0x0f; BYTE $0x30; // WRMSR
RET
-// jumpToUser changes execution to the user address.
-//
-// This works by changing the return value to the user version.
-TEXT ·jumpToUser(SB),NOSPLIT,$0
- MOVQ 0(SP), AX
- MOVQ ·KernelStartAddress(SB), BX
- NOTQ BX
- ANDQ BX, SP // Switch the stack.
- ANDQ BX, BP // Switch the frame pointer.
- ANDQ BX, AX // Future return value.
- MOVQ AX, 0(SP)
- RET
-
-// jumpToKernel changes execution to the kernel address space.
-//
-// This works by changing the return value to the kernel version.
-TEXT ·jumpToKernel(SB),NOSPLIT,$0
- MOVQ 0(SP), AX
- MOVQ ·KernelStartAddress(SB), BX
- ORQ BX, SP // Switch the stack.
- ORQ BX, BP // Switch the frame pointer.
- ORQ BX, AX // Future return value.
- MOVQ AX, 0(SP)
- RET
-
-// writeCR3 writes the given CR3 value.
-//
-// The code corresponds to:
-//
-// mov %rax, %cr3
-//
-TEXT ·writeCR3(SB),NOSPLIT,$0-8
- MOVQ cr3+0(FP), AX
- BYTE $0x0f; BYTE $0x22; BYTE $0xd8;
- RET
-
-// readCR3 reads the current CR3 value.
-//
-// The code corresponds to:
-//
-// mov %cr3, %rax
-//
-TEXT ·readCR3(SB),NOSPLIT,$0-8
- BYTE $0x0f; BYTE $0x20; BYTE $0xd8;
- MOVQ AX, ret+0(FP)
- RET
-
// readCR2 reads the current CR2 value.
//
// The code corresponds to:
diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go
index 00e52c8af..2f1abcb0f 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.go
+++ b/pkg/sentry/platform/ring0/lib_arm64.go
@@ -16,6 +16,15 @@
package ring0
+// storeAppASID writes the application's asid value.
+func storeAppASID(asid uintptr)
+
+// LocalFlushTlbAll same as FlushTlbAll, but only applies to the calling CPU.
+func LocalFlushTlbAll()
+
+// FlushTlbAll flush all tlb.
+func FlushTlbAll()
+
// CPACREL1 returns the value of the CPACR_EL1 register.
func CPACREL1() (value uintptr)
diff --git a/pkg/sentry/platform/ring0/lib_arm64.s b/pkg/sentry/platform/ring0/lib_arm64.s
index 86bfbe46f..8aabf7d0e 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.s
+++ b/pkg/sentry/platform/ring0/lib_arm64.s
@@ -15,6 +15,20 @@
#include "funcdata.h"
#include "textflag.h"
+TEXT ·LocalFlushTlbAll(SB),NOSPLIT,$0
+ DSB $6 // dsb(nshst)
+ WORD $0xd508871f // __tlbi(vmalle1)
+ DSB $7 // dsb(nsh)
+ ISB $15
+ RET
+
+TEXT ·FlushTlbAll(SB),NOSPLIT,$0
+ DSB $10 // dsb(ishst)
+ WORD $0xd508831f // __tlbi(vmalle1is)
+ DSB $11 // dsb(ish)
+ ISB $15
+ RET
+
TEXT ·GetTLS(SB),NOSPLIT,$0-8
MRS TPIDR_EL0, R1
MOVD R1, ret+0(FP)
diff --git a/pkg/sentry/platform/ring0/offsets_amd64.go b/pkg/sentry/platform/ring0/offsets_amd64.go
index b8ab120a0..bcb73cb31 100644
--- a/pkg/sentry/platform/ring0/offsets_amd64.go
+++ b/pkg/sentry/platform/ring0/offsets_amd64.go
@@ -30,11 +30,18 @@ func Emit(w io.Writer) {
c := &CPU{}
fmt.Fprintf(w, "\n// CPU offsets.\n")
- fmt.Fprintf(w, "#define CPU_SELF 0x%02x\n", reflect.ValueOf(&c.self).Pointer()-reflect.ValueOf(c).Pointer())
fmt.Fprintf(w, "#define CPU_REGISTERS 0x%02x\n", reflect.ValueOf(&c.registers).Pointer()-reflect.ValueOf(c).Pointer())
- fmt.Fprintf(w, "#define CPU_STACK_TOP 0x%02x\n", reflect.ValueOf(&c.stack[0]).Pointer()-reflect.ValueOf(c).Pointer()+uintptr(len(c.stack)))
fmt.Fprintf(w, "#define CPU_ERROR_CODE 0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer())
fmt.Fprintf(w, "#define CPU_ERROR_TYPE 0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer())
+ fmt.Fprintf(w, "#define CPU_ENTRY 0x%02x\n", reflect.ValueOf(&c.kernelEntry).Pointer()-reflect.ValueOf(c).Pointer())
+
+ e := &kernelEntry{}
+ fmt.Fprintf(w, "\n// CPU entry offsets.\n")
+ fmt.Fprintf(w, "#define ENTRY_SCRATCH0 0x%02x\n", reflect.ValueOf(&e.scratch0).Pointer()-reflect.ValueOf(e).Pointer())
+ fmt.Fprintf(w, "#define ENTRY_SCRATCH1 0x%02x\n", reflect.ValueOf(&e.scratch1).Pointer()-reflect.ValueOf(e).Pointer())
+ fmt.Fprintf(w, "#define ENTRY_STACK_TOP 0x%02x\n", reflect.ValueOf(&e.stackTop).Pointer()-reflect.ValueOf(e).Pointer())
+ fmt.Fprintf(w, "#define ENTRY_CPU_SELF 0x%02x\n", reflect.ValueOf(&e.cpuSelf).Pointer()-reflect.ValueOf(e).Pointer())
+ fmt.Fprintf(w, "#define ENTRY_KERNEL_CR3 0x%02x\n", reflect.ValueOf(&e.kernelCR3).Pointer()-reflect.ValueOf(e).Pointer())
fmt.Fprintf(w, "\n// Bits.\n")
fmt.Fprintf(w, "#define _RFLAGS_IF 0x%02x\n", _RFLAGS_IF)
diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go
index f3de962f0..1d86b4bcf 100644
--- a/pkg/sentry/platform/ring0/offsets_arm64.go
+++ b/pkg/sentry/platform/ring0/offsets_arm64.go
@@ -41,6 +41,7 @@ func Emit(w io.Writer) {
fmt.Fprintf(w, "#define CPU_VECTOR_CODE 0x%02x\n", reflect.ValueOf(&c.vecCode).Pointer()-reflect.ValueOf(c).Pointer())
fmt.Fprintf(w, "#define CPU_APP_ADDR 0x%02x\n", reflect.ValueOf(&c.appAddr).Pointer()-reflect.ValueOf(c).Pointer())
fmt.Fprintf(w, "#define CPU_LAZY_VFP 0x%02x\n", reflect.ValueOf(&c.lazyVFP).Pointer()-reflect.ValueOf(c).Pointer())
+ fmt.Fprintf(w, "#define CPU_APP_ASID 0x%02x\n", reflect.ValueOf(&c.appASID).Pointer()-reflect.ValueOf(c).Pointer())
fmt.Fprintf(w, "\n// Bits.\n")
fmt.Fprintf(w, "#define _KERNEL_FLAGS 0x%02x\n", KernelFlagsSet)
diff --git a/pkg/sentry/platform/ring0/x86.go b/pkg/sentry/platform/ring0/x86.go
index 9da0ea685..e99da0b35 100644
--- a/pkg/sentry/platform/ring0/x86.go
+++ b/pkg/sentry/platform/ring0/x86.go
@@ -104,7 +104,7 @@ const (
VirtualizationException
SecurityException = 0x1e
SyscallInt80 = 0x80
- _NR_INTERRUPTS = SyscallInt80 + 1
+ _NR_INTERRUPTS = 0x100
)
// System call vectors.
diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD
index c0fd3425b..a3f775d15 100644
--- a/pkg/sentry/socket/BUILD
+++ b/pkg/sentry/socket/BUILD
@@ -10,6 +10,7 @@ go_library(
"//pkg/abi/linux",
"//pkg/binary",
"//pkg/context",
+ "//pkg/marshal",
"//pkg/sentry/device",
"//pkg/sentry/fs",
"//pkg/sentry/fs/fsutil",
@@ -20,6 +21,5 @@ go_library(
"//pkg/syserr",
"//pkg/tcpip",
"//pkg/usermem",
- "//tools/go_marshal/marshal",
],
)
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index e76e498de..b6ebe29d6 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -21,6 +21,8 @@ go_library(
"//pkg/context",
"//pkg/fdnotifier",
"//pkg/log",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
"//pkg/safemem",
"//pkg/sentry/arch",
"//pkg/sentry/device",
@@ -37,11 +39,12 @@ go_library(
"//pkg/sentry/vfs",
"//pkg/syserr",
"//pkg/syserror",
+ "//pkg/tcpip",
+ "//pkg/tcpip/network/ipv4",
+ "//pkg/tcpip/network/ipv6",
"//pkg/tcpip/stack",
"//pkg/usermem",
"//pkg/waiter",
- "//tools/go_marshal/marshal",
- "//tools/go_marshal/primitive",
"@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index 242e6bf76..7d3c4a01c 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -24,6 +24,8 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fdnotifier"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -36,8 +38,6 @@ import (
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
- "gvisor.dev/gvisor/tools/go_marshal/marshal"
- "gvisor.dev/gvisor/tools/go_marshal/primitive"
)
const (
diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go
index 8a1d52ebf..163af329b 100644
--- a/pkg/sentry/socket/hostinet/socket_vfs2.go
+++ b/pkg/sentry/socket/hostinet/socket_vfs2.go
@@ -52,6 +52,7 @@ var _ = socket.SocketVFS2(&socketVFS2{})
func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol int, fd int, flags uint32) (*vfs.FileDescription, *syserr.Error) {
mnt := t.Kernel().SocketMount()
d := sockfs.NewDentry(t.Credentials(), mnt)
+ defer d.DecRef(t)
s := &socketVFS2{
socketOpsCommon: socketOpsCommon{
@@ -77,6 +78,13 @@ func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol in
return vfsfd, nil
}
+// Release implements vfs.FileDescriptionImpl.Release.
+func (s *socketVFS2) Release(ctx context.Context) {
+ t := kernel.TaskFromContext(ctx)
+ t.Kernel().DeleteSocketVFS2(&s.vfsfd)
+ s.socketOpsCommon.Release(ctx)
+}
+
// Readiness implements waiter.Waitable.Readiness.
func (s *socketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask {
return s.socketOpsCommon.Readiness(mask)
@@ -97,11 +105,6 @@ func (s *socketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal
return ioctl(ctx, s.fd, uio, args)
}
-// Allocate implements vfs.FileDescriptionImpl.Allocate.
-func (s *socketVFS2) Allocate(ctx context.Context, mode, offset, length uint64) error {
- return syserror.ENODEV
-}
-
// PRead implements vfs.FileDescriptionImpl.PRead.
func (s *socketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
return 0, syserror.ESPIPE
diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go
index fda3dcb35..faa61160e 100644
--- a/pkg/sentry/socket/hostinet/stack.go
+++ b/pkg/sentry/socket/hostinet/stack.go
@@ -30,6 +30,9 @@ import (
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+ "gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -59,6 +62,8 @@ type Stack struct {
tcpSACKEnabled bool
netDevFile *os.File
netSNMPFile *os.File
+ ipv4Forwarding bool
+ ipv6Forwarding bool
}
// NewStack returns an empty Stack containing no configuration.
@@ -118,6 +123,13 @@ func (s *Stack) Configure() error {
s.netSNMPFile = f
}
+ s.ipv6Forwarding = false
+ if ipForwarding, err := ioutil.ReadFile("/proc/sys/net/ipv6/conf/all/forwarding"); err == nil {
+ s.ipv6Forwarding = strings.TrimSpace(string(ipForwarding)) != "0"
+ } else {
+ log.Warningf("Failed to read if ipv6 forwarding is enabled, setting to false")
+ }
+
return nil
}
@@ -468,3 +480,21 @@ func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint { return nil }
// RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints.
func (s *Stack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {}
+
+// Forwarding implements inet.Stack.Forwarding.
+func (s *Stack) Forwarding(protocol tcpip.NetworkProtocolNumber) bool {
+ switch protocol {
+ case ipv4.ProtocolNumber:
+ return s.ipv4Forwarding
+ case ipv6.ProtocolNumber:
+ return s.ipv6Forwarding
+ default:
+ log.Warningf("Forwarding(%v) failed: unsupported protocol", protocol)
+ return false
+ }
+}
+
+// SetForwarding implements inet.Stack.SetForwarding.
+func (s *Stack) SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error {
+ return syserror.EACCES
+}
diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go
index 0336a32d8..549787955 100644
--- a/pkg/sentry/socket/netfilter/extensions.go
+++ b/pkg/sentry/socket/netfilter/extensions.go
@@ -19,6 +19,8 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/syserr"
+ "gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -37,7 +39,7 @@ type matchMaker interface {
// name is the matcher name as stored in the xt_entry_match struct.
name() string
- // marshal converts from an stack.Matcher to an ABI struct.
+ // marshal converts from a stack.Matcher to an ABI struct.
marshal(matcher stack.Matcher) []byte
// unmarshal converts from the ABI matcher struct to an
@@ -93,3 +95,71 @@ func unmarshalMatcher(match linux.XTEntryMatch, filter stack.IPHeaderFilter, buf
}
return matchMaker.unmarshal(buf, filter)
}
+
+// targetMaker knows how to (un)marshal a target. Once registered,
+// marshalTarget and unmarshalTarget can be used.
+type targetMaker interface {
+ // id uniquely identifies the target.
+ id() stack.TargetID
+
+ // marshal converts from a stack.Target to an ABI struct.
+ marshal(target stack.Target) []byte
+
+ // unmarshal converts from the ABI matcher struct to a stack.Target.
+ unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error)
+}
+
+// targetMakers maps the TargetID of supported targets to the targetMaker that
+// marshals and unmarshals it. It is immutable after package initialization.
+var targetMakers = map[stack.TargetID]targetMaker{}
+
+func targetRevision(name string, netProto tcpip.NetworkProtocolNumber, rev uint8) (uint8, bool) {
+ tid := stack.TargetID{
+ Name: name,
+ NetworkProtocol: netProto,
+ Revision: rev,
+ }
+ if _, ok := targetMakers[tid]; !ok {
+ return 0, false
+ }
+
+ // Return the highest supported revision unless rev is higher.
+ for _, other := range targetMakers {
+ otherID := other.id()
+ if name == otherID.Name && netProto == otherID.NetworkProtocol && otherID.Revision > rev {
+ rev = uint8(otherID.Revision)
+ }
+ }
+ return rev, true
+}
+
+// registerTargetMaker should be called by target extensions to register them
+// with the netfilter package.
+func registerTargetMaker(tm targetMaker) {
+ if _, ok := targetMakers[tm.id()]; ok {
+ panic(fmt.Sprintf("multiple targets registered with name %q.", tm.id()))
+ }
+ targetMakers[tm.id()] = tm
+}
+
+func marshalTarget(target stack.Target) []byte {
+ targetMaker, ok := targetMakers[target.ID()]
+ if !ok {
+ panic(fmt.Sprintf("unknown target of type %T with id %+v.", target, target.ID()))
+ }
+ return targetMaker.marshal(target)
+}
+
+func unmarshalTarget(target linux.XTEntryTarget, filter stack.IPHeaderFilter, buf []byte) (stack.Target, *syserr.Error) {
+ tid := stack.TargetID{
+ Name: target.Name.String(),
+ NetworkProtocol: filter.NetworkProtocol(),
+ Revision: target.Revision,
+ }
+ targetMaker, ok := targetMakers[tid]
+ if !ok {
+ nflog("unsupported target with name %q", target.Name.String())
+ return nil, syserr.ErrInvalidArgument
+ }
+ return targetMaker.unmarshal(buf, filter)
+}
diff --git a/pkg/sentry/socket/netfilter/ipv4.go b/pkg/sentry/socket/netfilter/ipv4.go
index e4c55a100..b560fae0d 100644
--- a/pkg/sentry/socket/netfilter/ipv4.go
+++ b/pkg/sentry/socket/netfilter/ipv4.go
@@ -181,18 +181,23 @@ func modifyEntries4(stk *stack.Stack, optVal []byte, replace *linux.IPTReplace,
nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
return nil, syserr.ErrInvalidArgument
}
- target, err := parseTarget(filter, optVal[:targetSize])
- if err != nil {
- nflog("failed to parse target: %v", err)
- return nil, syserr.ErrInvalidArgument
- }
- optVal = optVal[targetSize:]
- table.Rules = append(table.Rules, stack.Rule{
+ rule := stack.Rule{
Filter: filter,
- Target: target,
Matchers: matchers,
- })
+ }
+
+ {
+ target, err := parseTarget(filter, optVal[:targetSize], false /* ipv6 */)
+ if err != nil {
+ nflog("failed to parse target: %v", err)
+ return nil, err
+ }
+ rule.Target = target
+ }
+ optVal = optVal[targetSize:]
+
+ table.Rules = append(table.Rules, rule)
offsets[offset] = int(entryIdx)
offset += uint32(entry.NextOffset)
diff --git a/pkg/sentry/socket/netfilter/ipv6.go b/pkg/sentry/socket/netfilter/ipv6.go
index 3b2c1becd..4253f7bf4 100644
--- a/pkg/sentry/socket/netfilter/ipv6.go
+++ b/pkg/sentry/socket/netfilter/ipv6.go
@@ -184,18 +184,23 @@ func modifyEntries6(stk *stack.Stack, optVal []byte, replace *linux.IPTReplace,
nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
return nil, syserr.ErrInvalidArgument
}
- target, err := parseTarget(filter, optVal[:targetSize])
- if err != nil {
- nflog("failed to parse target: %v", err)
- return nil, syserr.ErrInvalidArgument
- }
- optVal = optVal[targetSize:]
- table.Rules = append(table.Rules, stack.Rule{
+ rule := stack.Rule{
Filter: filter,
- Target: target,
Matchers: matchers,
- })
+ }
+
+ {
+ target, err := parseTarget(filter, optVal[:targetSize], true /* ipv6 */)
+ if err != nil {
+ nflog("failed to parse target: %v", err)
+ return nil, err
+ }
+ rule.Target = target
+ }
+ optVal = optVal[targetSize:]
+
+ table.Rules = append(table.Rules, rule)
offsets[offset] = int(entryIdx)
offset += uint32(entry.NextOffset)
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 3e1735079..904a12e38 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -25,6 +25,7 @@ import (
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/syserr"
+ "gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -195,7 +196,7 @@ func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error {
// Check the user chains.
for ruleIdx, rule := range table.Rules {
- if _, ok := rule.Target.(stack.UserChainTarget); !ok {
+ if _, ok := rule.Target.(*stack.UserChainTarget); !ok {
continue
}
@@ -216,7 +217,7 @@ func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error {
// Set each jump to point to the appropriate rule. Right now they hold byte
// offsets.
for ruleIdx, rule := range table.Rules {
- jump, ok := rule.Target.(JumpTarget)
+ jump, ok := rule.Target.(*JumpTarget)
if !ok {
continue
}
@@ -307,7 +308,7 @@ func validUnderflow(rule stack.Rule, ipv6 bool) bool {
return false
}
switch rule.Target.(type) {
- case stack.AcceptTarget, stack.DropTarget:
+ case *stack.AcceptTarget, *stack.DropTarget:
return true
default:
return false
@@ -318,7 +319,7 @@ func isUnconditionalAccept(rule stack.Rule, ipv6 bool) bool {
if !validUnderflow(rule, ipv6) {
return false
}
- _, ok := rule.Target.(stack.AcceptTarget)
+ _, ok := rule.Target.(*stack.AcceptTarget)
return ok
}
@@ -337,3 +338,20 @@ func hookFromLinux(hook int) stack.Hook {
}
panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook))
}
+
+// TargetRevision returns a linux.XTGetRevision for a given target. It sets
+// Revision to the highest supported value, unless the provided revision number
+// is larger.
+func TargetRevision(t *kernel.Task, revPtr usermem.Addr, netProto tcpip.NetworkProtocolNumber) (linux.XTGetRevision, *syserr.Error) {
+ // Read in the target name and version.
+ var rev linux.XTGetRevision
+ if _, err := rev.CopyIn(t, revPtr); err != nil {
+ return linux.XTGetRevision{}, syserr.FromError(err)
+ }
+ maxSupported, ok := targetRevision(rev.Name.String(), netProto, rev.Revision)
+ if !ok {
+ return linux.XTGetRevision{}, syserr.ErrProtocolNotSupported
+ }
+ rev.Revision = maxSupported
+ return rev, nil
+}
diff --git a/pkg/sentry/socket/netfilter/targets.go b/pkg/sentry/socket/netfilter/targets.go
index 87e41abd8..0e14447fe 100644
--- a/pkg/sentry/socket/netfilter/targets.go
+++ b/pkg/sentry/socket/netfilter/targets.go
@@ -15,255 +15,357 @@
package netfilter
import (
- "errors"
"fmt"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/usermem"
)
-// errorTargetName is used to mark targets as error targets. Error targets
-// shouldn't be reached - an error has occurred if we fall through to one.
-const errorTargetName = "ERROR"
+func init() {
+ // Standard targets include ACCEPT, DROP, RETURN, and JUMP.
+ registerTargetMaker(&standardTargetMaker{
+ NetworkProtocol: header.IPv4ProtocolNumber,
+ })
+ registerTargetMaker(&standardTargetMaker{
+ NetworkProtocol: header.IPv6ProtocolNumber,
+ })
+
+ // Both user chains and actual errors are represented in iptables by
+ // error targets.
+ registerTargetMaker(&errorTargetMaker{
+ NetworkProtocol: header.IPv4ProtocolNumber,
+ })
+ registerTargetMaker(&errorTargetMaker{
+ NetworkProtocol: header.IPv6ProtocolNumber,
+ })
+
+ registerTargetMaker(&redirectTargetMaker{
+ NetworkProtocol: header.IPv4ProtocolNumber,
+ })
+ registerTargetMaker(&nfNATTargetMaker{
+ NetworkProtocol: header.IPv6ProtocolNumber,
+ })
+}
-// redirectTargetName is used to mark targets as redirect targets. Redirect
-// targets should be reached for only NAT and Mangle tables. These targets will
-// change the destination port/destination IP for packets.
-const redirectTargetName = "REDIRECT"
+type standardTargetMaker struct {
+ NetworkProtocol tcpip.NetworkProtocolNumber
+}
-func marshalTarget(target stack.Target) []byte {
+func (sm *standardTargetMaker) id() stack.TargetID {
+ // Standard targets have the empty string as a name and no revisions.
+ return stack.TargetID{
+ NetworkProtocol: sm.NetworkProtocol,
+ }
+}
+func (*standardTargetMaker) marshal(target stack.Target) []byte {
+ // Translate verdicts the same way as the iptables tool.
+ var verdict int32
switch tg := target.(type) {
- case stack.AcceptTarget:
- return marshalStandardTarget(stack.RuleAccept)
- case stack.DropTarget:
- return marshalStandardTarget(stack.RuleDrop)
- case stack.ErrorTarget:
- return marshalErrorTarget(errorTargetName)
- case stack.UserChainTarget:
- return marshalErrorTarget(tg.Name)
- case stack.ReturnTarget:
- return marshalStandardTarget(stack.RuleReturn)
- case stack.RedirectTarget:
- return marshalRedirectTarget(tg)
- case JumpTarget:
- return marshalJumpTarget(tg)
+ case *stack.AcceptTarget:
+ verdict = -linux.NF_ACCEPT - 1
+ case *stack.DropTarget:
+ verdict = -linux.NF_DROP - 1
+ case *stack.ReturnTarget:
+ verdict = linux.NF_RETURN
+ case *JumpTarget:
+ verdict = int32(tg.Offset)
default:
panic(fmt.Errorf("unknown target of type %T", target))
}
-}
-
-func marshalStandardTarget(verdict stack.RuleVerdict) []byte {
- nflog("convert to binary: marshalling standard target")
// The target's name will be the empty string.
- target := linux.XTStandardTarget{
+ xt := linux.XTStandardTarget{
Target: linux.XTEntryTarget{
TargetSize: linux.SizeOfXTStandardTarget,
},
- Verdict: translateFromStandardVerdict(verdict),
+ Verdict: verdict,
}
ret := make([]byte, 0, linux.SizeOfXTStandardTarget)
- return binary.Marshal(ret, usermem.ByteOrder, target)
+ return binary.Marshal(ret, usermem.ByteOrder, xt)
+}
+
+func (*standardTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) {
+ if len(buf) != linux.SizeOfXTStandardTarget {
+ nflog("buf has wrong size for standard target %d", len(buf))
+ return nil, syserr.ErrInvalidArgument
+ }
+ var standardTarget linux.XTStandardTarget
+ buf = buf[:linux.SizeOfXTStandardTarget]
+ binary.Unmarshal(buf, usermem.ByteOrder, &standardTarget)
+
+ if standardTarget.Verdict < 0 {
+ // A Verdict < 0 indicates a non-jump verdict.
+ return translateToStandardTarget(standardTarget.Verdict, filter.NetworkProtocol())
+ }
+ // A verdict >= 0 indicates a jump.
+ return &JumpTarget{
+ Offset: uint32(standardTarget.Verdict),
+ NetworkProtocol: filter.NetworkProtocol(),
+ }, nil
+}
+
+type errorTargetMaker struct {
+ NetworkProtocol tcpip.NetworkProtocolNumber
+}
+
+func (em *errorTargetMaker) id() stack.TargetID {
+ // Error targets have no revision.
+ return stack.TargetID{
+ Name: stack.ErrorTargetName,
+ NetworkProtocol: em.NetworkProtocol,
+ }
}
-func marshalErrorTarget(errorName string) []byte {
+func (*errorTargetMaker) marshal(target stack.Target) []byte {
+ var errorName string
+ switch tg := target.(type) {
+ case *stack.ErrorTarget:
+ errorName = stack.ErrorTargetName
+ case *stack.UserChainTarget:
+ errorName = tg.Name
+ default:
+ panic(fmt.Sprintf("errorMakerTarget cannot marshal unknown type %T", target))
+ }
+
// This is an error target named error
- target := linux.XTErrorTarget{
+ xt := linux.XTErrorTarget{
Target: linux.XTEntryTarget{
TargetSize: linux.SizeOfXTErrorTarget,
},
}
- copy(target.Name[:], errorName)
- copy(target.Target.Name[:], errorTargetName)
+ copy(xt.Name[:], errorName)
+ copy(xt.Target.Name[:], stack.ErrorTargetName)
ret := make([]byte, 0, linux.SizeOfXTErrorTarget)
- return binary.Marshal(ret, usermem.ByteOrder, target)
+ return binary.Marshal(ret, usermem.ByteOrder, xt)
}
-func marshalRedirectTarget(rt stack.RedirectTarget) []byte {
+func (*errorTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) {
+ if len(buf) != linux.SizeOfXTErrorTarget {
+ nflog("buf has insufficient size for error target %d", len(buf))
+ return nil, syserr.ErrInvalidArgument
+ }
+ var errorTarget linux.XTErrorTarget
+ buf = buf[:linux.SizeOfXTErrorTarget]
+ binary.Unmarshal(buf, usermem.ByteOrder, &errorTarget)
+
+ // Error targets are used in 2 cases:
+ // * An actual error case. These rules have an error
+ // named stack.ErrorTargetName. The last entry of the table
+ // is usually an error case to catch any packets that
+ // somehow fall through every rule.
+ // * To mark the start of a user defined chain. These
+ // rules have an error with the name of the chain.
+ switch name := errorTarget.Name.String(); name {
+ case stack.ErrorTargetName:
+ return &stack.ErrorTarget{NetworkProtocol: filter.NetworkProtocol()}, nil
+ default:
+ // User defined chain.
+ return &stack.UserChainTarget{
+ Name: name,
+ NetworkProtocol: filter.NetworkProtocol(),
+ }, nil
+ }
+}
+
+type redirectTargetMaker struct {
+ NetworkProtocol tcpip.NetworkProtocolNumber
+}
+
+func (rm *redirectTargetMaker) id() stack.TargetID {
+ return stack.TargetID{
+ Name: stack.RedirectTargetName,
+ NetworkProtocol: rm.NetworkProtocol,
+ }
+}
+
+func (*redirectTargetMaker) marshal(target stack.Target) []byte {
+ rt := target.(*stack.RedirectTarget)
// This is a redirect target named redirect
- target := linux.XTRedirectTarget{
+ xt := linux.XTRedirectTarget{
Target: linux.XTEntryTarget{
TargetSize: linux.SizeOfXTRedirectTarget,
},
}
- copy(target.Target.Name[:], redirectTargetName)
+ copy(xt.Target.Name[:], stack.RedirectTargetName)
ret := make([]byte, 0, linux.SizeOfXTRedirectTarget)
- target.NfRange.RangeSize = 1
- if rt.RangeProtoSpecified {
- target.NfRange.RangeIPV4.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED
+ xt.NfRange.RangeSize = 1
+ xt.NfRange.RangeIPV4.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED
+ xt.NfRange.RangeIPV4.MinPort = htons(rt.Port)
+ xt.NfRange.RangeIPV4.MaxPort = xt.NfRange.RangeIPV4.MinPort
+ return binary.Marshal(ret, usermem.ByteOrder, xt)
+}
+
+func (*redirectTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) {
+ if len(buf) < linux.SizeOfXTRedirectTarget {
+ nflog("redirectTargetMaker: buf has insufficient size for redirect target %d", len(buf))
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ if p := filter.Protocol; p != header.TCPProtocolNumber && p != header.UDPProtocolNumber {
+ nflog("redirectTargetMaker: bad proto %d", p)
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ var redirectTarget linux.XTRedirectTarget
+ buf = buf[:linux.SizeOfXTRedirectTarget]
+ binary.Unmarshal(buf, usermem.ByteOrder, &redirectTarget)
+
+ // Copy linux.XTRedirectTarget to stack.RedirectTarget.
+ target := stack.RedirectTarget{NetworkProtocol: filter.NetworkProtocol()}
+
+ // RangeSize should be 1.
+ nfRange := redirectTarget.NfRange
+ if nfRange.RangeSize != 1 {
+ nflog("redirectTargetMaker: bad rangesize %d", nfRange.RangeSize)
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ // TODO(gvisor.dev/issue/170): Check if the flags are valid.
+ // Also check if we need to map ports or IP.
+ // For now, redirect target only supports destination port change.
+ // Port range and IP range are not supported yet.
+ if nfRange.RangeIPV4.Flags != linux.NF_NAT_RANGE_PROTO_SPECIFIED {
+ nflog("redirectTargetMaker: invalid range flags %d", nfRange.RangeIPV4.Flags)
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ // TODO(gvisor.dev/issue/170): Port range is not supported yet.
+ if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort {
+ nflog("redirectTargetMaker: MinPort != MaxPort (%d, %d)", nfRange.RangeIPV4.MinPort, nfRange.RangeIPV4.MaxPort)
+ return nil, syserr.ErrInvalidArgument
}
- // Convert port from little endian to big endian.
- port := make([]byte, 2)
- binary.LittleEndian.PutUint16(port, rt.MinPort)
- target.NfRange.RangeIPV4.MinPort = binary.BigEndian.Uint16(port)
- binary.LittleEndian.PutUint16(port, rt.MaxPort)
- target.NfRange.RangeIPV4.MaxPort = binary.BigEndian.Uint16(port)
- return binary.Marshal(ret, usermem.ByteOrder, target)
+ if nfRange.RangeIPV4.MinIP != nfRange.RangeIPV4.MaxIP {
+ nflog("redirectTargetMaker: MinIP != MaxIP (%d, %d)", nfRange.RangeIPV4.MinPort, nfRange.RangeIPV4.MaxPort)
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ target.Addr = tcpip.Address(nfRange.RangeIPV4.MinIP[:])
+ target.Port = ntohs(nfRange.RangeIPV4.MinPort)
+
+ return &target, nil
}
-func marshalJumpTarget(jt JumpTarget) []byte {
- nflog("convert to binary: marshalling jump target")
+type nfNATTarget struct {
+ Target linux.XTEntryTarget
+ Range linux.NFNATRange
+}
- // The target's name will be the empty string.
- target := linux.XTStandardTarget{
+const nfNATMarhsalledSize = linux.SizeOfXTEntryTarget + linux.SizeOfNFNATRange
+
+type nfNATTargetMaker struct {
+ NetworkProtocol tcpip.NetworkProtocolNumber
+}
+
+func (rm *nfNATTargetMaker) id() stack.TargetID {
+ return stack.TargetID{
+ Name: stack.RedirectTargetName,
+ NetworkProtocol: rm.NetworkProtocol,
+ }
+}
+
+func (*nfNATTargetMaker) marshal(target stack.Target) []byte {
+ rt := target.(*stack.RedirectTarget)
+ nt := nfNATTarget{
Target: linux.XTEntryTarget{
- TargetSize: linux.SizeOfXTStandardTarget,
+ TargetSize: nfNATMarhsalledSize,
+ },
+ Range: linux.NFNATRange{
+ Flags: linux.NF_NAT_RANGE_PROTO_SPECIFIED,
},
- // Verdict is overloaded by the ABI. When positive, it holds
- // the jump offset from the start of the table.
- Verdict: int32(jt.Offset),
}
+ copy(nt.Target.Name[:], stack.RedirectTargetName)
+ copy(nt.Range.MinAddr[:], rt.Addr)
+ copy(nt.Range.MaxAddr[:], rt.Addr)
- ret := make([]byte, 0, linux.SizeOfXTStandardTarget)
- return binary.Marshal(ret, usermem.ByteOrder, target)
+ nt.Range.MinProto = htons(rt.Port)
+ nt.Range.MaxProto = nt.Range.MinProto
+
+ ret := make([]byte, 0, nfNATMarhsalledSize)
+ return binary.Marshal(ret, usermem.ByteOrder, nt)
}
-// translateFromStandardVerdict translates verdicts the same way as the iptables
-// tool.
-func translateFromStandardVerdict(verdict stack.RuleVerdict) int32 {
- switch verdict {
- case stack.RuleAccept:
- return -linux.NF_ACCEPT - 1
- case stack.RuleDrop:
- return -linux.NF_DROP - 1
- case stack.RuleReturn:
- return linux.NF_RETURN
- default:
- // TODO(gvisor.dev/issue/170): Support Jump.
- panic(fmt.Sprintf("unknown standard verdict: %d", verdict))
+func (*nfNATTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) {
+ if size := nfNATMarhsalledSize; len(buf) < size {
+ nflog("nfNATTargetMaker: buf has insufficient size (%d) for nfNAT target (%d)", len(buf), size)
+ return nil, syserr.ErrInvalidArgument
}
+
+ if p := filter.Protocol; p != header.TCPProtocolNumber && p != header.UDPProtocolNumber {
+ nflog("nfNATTargetMaker: bad proto %d", p)
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ var natRange linux.NFNATRange
+ buf = buf[linux.SizeOfXTEntryTarget:nfNATMarhsalledSize]
+ binary.Unmarshal(buf, usermem.ByteOrder, &natRange)
+
+ // We don't support port or address ranges.
+ if natRange.MinAddr != natRange.MaxAddr {
+ nflog("nfNATTargetMaker: MinAddr and MaxAddr are different")
+ return nil, syserr.ErrInvalidArgument
+ }
+ if natRange.MinProto != natRange.MaxProto {
+ nflog("nfNATTargetMaker: MinProto and MaxProto are different")
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ // TODO(gvisor.dev/issue/3549): Check for other flags.
+ // For now, redirect target only supports destination change.
+ if natRange.Flags != linux.NF_NAT_RANGE_PROTO_SPECIFIED {
+ nflog("nfNATTargetMaker: invalid range flags %d", natRange.Flags)
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ target := stack.RedirectTarget{
+ NetworkProtocol: filter.NetworkProtocol(),
+ Addr: tcpip.Address(natRange.MinAddr[:]),
+ Port: ntohs(natRange.MinProto),
+ }
+
+ return &target, nil
}
// translateToStandardTarget translates from the value in a
// linux.XTStandardTarget to an stack.Verdict.
-func translateToStandardTarget(val int32) (stack.Target, error) {
+func translateToStandardTarget(val int32, netProto tcpip.NetworkProtocolNumber) (stack.Target, *syserr.Error) {
// TODO(gvisor.dev/issue/170): Support other verdicts.
switch val {
case -linux.NF_ACCEPT - 1:
- return stack.AcceptTarget{}, nil
+ return &stack.AcceptTarget{NetworkProtocol: netProto}, nil
case -linux.NF_DROP - 1:
- return stack.DropTarget{}, nil
+ return &stack.DropTarget{NetworkProtocol: netProto}, nil
case -linux.NF_QUEUE - 1:
- return nil, errors.New("unsupported iptables verdict QUEUE")
+ nflog("unsupported iptables verdict QUEUE")
+ return nil, syserr.ErrInvalidArgument
case linux.NF_RETURN:
- return stack.ReturnTarget{}, nil
+ return &stack.ReturnTarget{NetworkProtocol: netProto}, nil
default:
- return nil, fmt.Errorf("unknown iptables verdict %d", val)
+ nflog("unknown iptables verdict %d", val)
+ return nil, syserr.ErrInvalidArgument
}
}
// parseTarget parses a target from optVal. optVal should contain only the
// target.
-func parseTarget(filter stack.IPHeaderFilter, optVal []byte) (stack.Target, error) {
+func parseTarget(filter stack.IPHeaderFilter, optVal []byte, ipv6 bool) (stack.Target, *syserr.Error) {
nflog("set entries: parsing target of size %d", len(optVal))
if len(optVal) < linux.SizeOfXTEntryTarget {
- return nil, fmt.Errorf("optVal has insufficient size for entry target %d", len(optVal))
+ nflog("optVal has insufficient size for entry target %d", len(optVal))
+ return nil, syserr.ErrInvalidArgument
}
var target linux.XTEntryTarget
buf := optVal[:linux.SizeOfXTEntryTarget]
binary.Unmarshal(buf, usermem.ByteOrder, &target)
- switch target.Name.String() {
- case "":
- // Standard target.
- if len(optVal) != linux.SizeOfXTStandardTarget {
- return nil, fmt.Errorf("optVal has wrong size for standard target %d", len(optVal))
- }
- var standardTarget linux.XTStandardTarget
- buf = optVal[:linux.SizeOfXTStandardTarget]
- binary.Unmarshal(buf, usermem.ByteOrder, &standardTarget)
-
- if standardTarget.Verdict < 0 {
- // A Verdict < 0 indicates a non-jump verdict.
- return translateToStandardTarget(standardTarget.Verdict)
- }
- // A verdict >= 0 indicates a jump.
- return JumpTarget{Offset: uint32(standardTarget.Verdict)}, nil
-
- case errorTargetName:
- // Error target.
- if len(optVal) != linux.SizeOfXTErrorTarget {
- return nil, fmt.Errorf("optVal has insufficient size for error target %d", len(optVal))
- }
- var errorTarget linux.XTErrorTarget
- buf = optVal[:linux.SizeOfXTErrorTarget]
- binary.Unmarshal(buf, usermem.ByteOrder, &errorTarget)
-
- // Error targets are used in 2 cases:
- // * An actual error case. These rules have an error
- // named errorTargetName. The last entry of the table
- // is usually an error case to catch any packets that
- // somehow fall through every rule.
- // * To mark the start of a user defined chain. These
- // rules have an error with the name of the chain.
- switch name := errorTarget.Name.String(); name {
- case errorTargetName:
- nflog("set entries: error target")
- return stack.ErrorTarget{}, nil
- default:
- // User defined chain.
- nflog("set entries: user-defined target %q", name)
- return stack.UserChainTarget{Name: name}, nil
- }
-
- case redirectTargetName:
- // Redirect target.
- if len(optVal) < linux.SizeOfXTRedirectTarget {
- return nil, fmt.Errorf("netfilter.SetEntries: optVal has insufficient size for redirect target %d", len(optVal))
- }
-
- if p := filter.Protocol; p != header.TCPProtocolNumber && p != header.UDPProtocolNumber {
- return nil, fmt.Errorf("netfilter.SetEntries: bad proto %d", p)
- }
-
- var redirectTarget linux.XTRedirectTarget
- buf = optVal[:linux.SizeOfXTRedirectTarget]
- binary.Unmarshal(buf, usermem.ByteOrder, &redirectTarget)
-
- // Copy linux.XTRedirectTarget to stack.RedirectTarget.
- var target stack.RedirectTarget
- nfRange := redirectTarget.NfRange
-
- // RangeSize should be 1.
- if nfRange.RangeSize != 1 {
- return nil, fmt.Errorf("netfilter.SetEntries: bad rangesize %d", nfRange.RangeSize)
- }
-
- // TODO(gvisor.dev/issue/170): Check if the flags are valid.
- // Also check if we need to map ports or IP.
- // For now, redirect target only supports destination port change.
- // Port range and IP range are not supported yet.
- if nfRange.RangeIPV4.Flags&linux.NF_NAT_RANGE_PROTO_SPECIFIED == 0 {
- return nil, fmt.Errorf("netfilter.SetEntries: invalid range flags %d", nfRange.RangeIPV4.Flags)
- }
- target.RangeProtoSpecified = true
-
- target.MinIP = tcpip.Address(nfRange.RangeIPV4.MinIP[:])
- target.MaxIP = tcpip.Address(nfRange.RangeIPV4.MaxIP[:])
-
- // TODO(gvisor.dev/issue/170): Port range is not supported yet.
- if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort {
- return nil, fmt.Errorf("netfilter.SetEntries: minport != maxport (%d, %d)", nfRange.RangeIPV4.MinPort, nfRange.RangeIPV4.MaxPort)
- }
-
- // Convert port from big endian to little endian.
- port := make([]byte, 2)
- binary.BigEndian.PutUint16(port, nfRange.RangeIPV4.MinPort)
- target.MinPort = binary.LittleEndian.Uint16(port)
-
- binary.BigEndian.PutUint16(port, nfRange.RangeIPV4.MaxPort)
- target.MaxPort = binary.LittleEndian.Uint16(port)
- return target, nil
- }
- // Unknown target.
- return nil, fmt.Errorf("unknown target %q doesn't exist or isn't supported yet", target.Name.String())
+ return unmarshalTarget(target, filter, optVal)
}
// JumpTarget implements stack.Target.
@@ -274,9 +376,31 @@ type JumpTarget struct {
// RuleNum is the rule to jump to.
RuleNum int
+
+ // NetworkProtocol is the network protocol the target is used with.
+ NetworkProtocol tcpip.NetworkProtocolNumber
+}
+
+// ID implements Target.ID.
+func (jt *JumpTarget) ID() stack.TargetID {
+ return stack.TargetID{
+ NetworkProtocol: jt.NetworkProtocol,
+ }
}
// Action implements stack.Target.Action.
-func (jt JumpTarget) Action(*stack.PacketBuffer, *stack.ConnTrack, stack.Hook, *stack.GSO, *stack.Route, tcpip.Address) (stack.RuleVerdict, int) {
+func (jt *JumpTarget) Action(*stack.PacketBuffer, *stack.ConnTrack, stack.Hook, *stack.GSO, *stack.Route, tcpip.Address) (stack.RuleVerdict, int) {
return stack.RuleJump, jt.RuleNum
}
+
+func ntohs(port uint16) uint16 {
+ buf := make([]byte, 2)
+ binary.BigEndian.PutUint16(buf, port)
+ return usermem.ByteOrder.Uint16(buf)
+}
+
+func htons(port uint16) uint16 {
+ buf := make([]byte, 2)
+ usermem.ByteOrder.PutUint16(buf, port)
+ return binary.BigEndian.Uint16(buf)
+}
diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go
index 0bfd6c1f4..844acfede 100644
--- a/pkg/sentry/socket/netfilter/tcp_matcher.go
+++ b/pkg/sentry/socket/netfilter/tcp_matcher.go
@@ -97,17 +97,33 @@ func (*TCPMatcher) Name() string {
// Match implements Matcher.Match.
func (tm *TCPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
- netHeader := header.IPv4(pkt.NetworkHeader().View())
+ // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
+ // into the stack.Check codepath as matchers are added.
+ switch pkt.NetworkProtocolNumber {
+ case header.IPv4ProtocolNumber:
+ netHeader := header.IPv4(pkt.NetworkHeader().View())
+ if netHeader.TransportProtocol() != header.TCPProtocolNumber {
+ return false, false
+ }
- if netHeader.TransportProtocol() != header.TCPProtocolNumber {
- return false, false
- }
+ // We don't match fragments.
+ if frag := netHeader.FragmentOffset(); frag != 0 {
+ if frag == 1 {
+ return false, true
+ }
+ return false, false
+ }
- // We dont't match fragments.
- if frag := netHeader.FragmentOffset(); frag != 0 {
- if frag == 1 {
- return false, true
+ case header.IPv6ProtocolNumber:
+ // As in Linux, we do not perform an IPv6 fragment check. See
+ // xt_action_param.fragoff in
+ // include/linux/netfilter/x_tables.h.
+ if header.IPv6(pkt.NetworkHeader().View()).TransportProtocol() != header.TCPProtocolNumber {
+ return false, false
}
+
+ default:
+ // We don't know the network protocol.
return false, false
}
diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go
index 7ed05461d..63201201c 100644
--- a/pkg/sentry/socket/netfilter/udp_matcher.go
+++ b/pkg/sentry/socket/netfilter/udp_matcher.go
@@ -94,19 +94,33 @@ func (*UDPMatcher) Name() string {
// Match implements Matcher.Match.
func (um *UDPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
- netHeader := header.IPv4(pkt.NetworkHeader().View())
-
// TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
// into the stack.Check codepath as matchers are added.
- if netHeader.TransportProtocol() != header.UDPProtocolNumber {
- return false, false
- }
+ switch pkt.NetworkProtocolNumber {
+ case header.IPv4ProtocolNumber:
+ netHeader := header.IPv4(pkt.NetworkHeader().View())
+ if netHeader.TransportProtocol() != header.UDPProtocolNumber {
+ return false, false
+ }
- // We dont't match fragments.
- if frag := netHeader.FragmentOffset(); frag != 0 {
- if frag == 1 {
- return false, true
+ // We don't match fragments.
+ if frag := netHeader.FragmentOffset(); frag != 0 {
+ if frag == 1 {
+ return false, true
+ }
+ return false, false
}
+
+ case header.IPv6ProtocolNumber:
+ // As in Linux, we do not perform an IPv6 fragment check. See
+ // xt_action_param.fragoff in
+ // include/linux/netfilter/x_tables.h.
+ if header.IPv6(pkt.NetworkHeader().View()).TransportProtocol() != header.UDPProtocolNumber {
+ return false, false
+ }
+
+ default:
+ // We don't know the network protocol.
return false, false
}
diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index 0546801bf..1f926aa91 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -16,6 +16,8 @@ go_library(
"//pkg/abi/linux",
"//pkg/binary",
"//pkg/context",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
"//pkg/sentry/arch",
"//pkg/sentry/device",
"//pkg/sentry/fs",
@@ -36,8 +38,6 @@ go_library(
"//pkg/tcpip",
"//pkg/usermem",
"//pkg/waiter",
- "//tools/go_marshal/marshal",
- "//tools/go_marshal/primitive",
],
)
diff --git a/pkg/sentry/socket/netlink/provider_vfs2.go b/pkg/sentry/socket/netlink/provider_vfs2.go
index bb205be0d..e8930f031 100644
--- a/pkg/sentry/socket/netlink/provider_vfs2.go
+++ b/pkg/sentry/socket/netlink/provider_vfs2.go
@@ -52,6 +52,7 @@ func (*socketProviderVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol
vfsfd := &s.vfsfd
mnt := t.Kernel().SocketMount()
d := sockfs.NewDentry(t.Credentials(), mnt)
+ defer d.DecRef(t)
if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{
DenyPRead: true,
DenyPWrite: true,
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index 68a9b9a96..5ddcd4be5 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -21,6 +21,8 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/device"
"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -38,8 +40,6 @@ import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
- "gvisor.dev/gvisor/tools/go_marshal/marshal"
- "gvisor.dev/gvisor/tools/go_marshal/primitive"
)
const sizeOfInt32 int = 4
diff --git a/pkg/sentry/socket/netlink/socket_vfs2.go b/pkg/sentry/socket/netlink/socket_vfs2.go
index a38d25da9..c83b23242 100644
--- a/pkg/sentry/socket/netlink/socket_vfs2.go
+++ b/pkg/sentry/socket/netlink/socket_vfs2.go
@@ -82,6 +82,13 @@ func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketV
return fd, nil
}
+// Release implements vfs.FileDescriptionImpl.Release.
+func (s *SocketVFS2) Release(ctx context.Context) {
+ t := kernel.TaskFromContext(ctx)
+ t.Kernel().DeleteSocketVFS2(&s.vfsfd)
+ s.socketOpsCommon.Release(ctx)
+}
+
// Readiness implements waiter.Waitable.Readiness.
func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask {
return s.socketOpsCommon.Readiness(mask)
diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD
index 1fb777a6c..fae3b6783 100644
--- a/pkg/sentry/socket/netstack/BUILD
+++ b/pkg/sentry/socket/netstack/BUILD
@@ -22,6 +22,8 @@ go_library(
"//pkg/binary",
"//pkg/context",
"//pkg/log",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
"//pkg/metric",
"//pkg/safemem",
"//pkg/sentry/arch",
@@ -51,8 +53,6 @@ go_library(
"//pkg/tcpip/transport/udp",
"//pkg/usermem",
"//pkg/waiter",
- "//tools/go_marshal/marshal",
- "//tools/go_marshal/primitive",
"@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 0bf21f7d8..87e30d742 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -40,6 +40,8 @@ import (
"gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/metric"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -62,8 +64,6 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
- "gvisor.dev/gvisor/tools/go_marshal/marshal"
- "gvisor.dev/gvisor/tools/go_marshal/primitive"
)
func mustCreateMetric(name, description string) *tcpip.StatCounter {
@@ -158,6 +158,9 @@ var Metrics = tcpip.Stats{
OutgoingPacketErrors: mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Total number of IP packets which failed to write to a link-layer endpoint."),
MalformedPacketsReceived: mustCreateMetric("/netstack/ip/malformed_packets_received", "Total number of IP packets which failed IP header validation checks."),
MalformedFragmentsReceived: mustCreateMetric("/netstack/ip/malformed_fragments_received", "Total number of IP fragments which failed IP fragment validation checks."),
+ IPTablesPreroutingDropped: mustCreateMetric("/netstack/ip/iptables/prerouting_dropped", "Total number of IP packets dropped in the Prerouting chain."),
+ IPTablesInputDropped: mustCreateMetric("/netstack/ip/iptables/input_dropped", "Total number of IP packets dropped in the Input chain."),
+ IPTablesOutputDropped: mustCreateMetric("/netstack/ip/iptables/output_dropped", "Total number of IP packets dropped in the Output chain."),
},
TCP: tcpip.TCPStats{
ActiveConnectionOpenings: mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."),
@@ -195,7 +198,6 @@ var Metrics = tcpip.Stats{
PacketsSent: mustCreateMetric("/netstack/udp/packets_sent", "Number of UDP datagrams sent."),
PacketSendErrors: mustCreateMetric("/netstack/udp/packet_send_errors", "Number of UDP datagrams failed to be sent."),
ChecksumErrors: mustCreateMetric("/netstack/udp/checksum_errors", "Number of UDP datagrams dropped due to bad checksums."),
- InvalidSourceAddress: mustCreateMetric("/netstack/udp/invalid_source", "Number of UDP datagrams dropped due to invalid source address."),
},
}
@@ -857,7 +859,7 @@ func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error {
// blockingAccept implements a blocking version of accept(2), that is, if no
// connections are ready to be accept, it will block until one becomes ready.
-func (s *socketOpsCommon) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) {
+func (s *socketOpsCommon) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) {
// Register for notifications.
e, ch := waiter.NewChannelEntry(nil)
s.EventRegister(&e, waiter.EventIn)
@@ -866,7 +868,7 @@ func (s *socketOpsCommon) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waite
// Try to accept the connection again; if it fails, then wait until we
// get a notification.
for {
- if ep, wq, err := s.Endpoint.Accept(); err != tcpip.ErrWouldBlock {
+ if ep, wq, err := s.Endpoint.Accept(peerAddr); err != tcpip.ErrWouldBlock {
return ep, wq, syserr.TranslateNetstackError(err)
}
@@ -879,15 +881,18 @@ func (s *socketOpsCommon) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waite
// Accept implements the linux syscall accept(2) for sockets backed by
// tcpip.Endpoint.
func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
- // Issue the accept request to get the new endpoint.
- ep, wq, terr := s.Endpoint.Accept()
+ var peerAddr *tcpip.FullAddress
+ if peerRequested {
+ peerAddr = &tcpip.FullAddress{}
+ }
+ ep, wq, terr := s.Endpoint.Accept(peerAddr)
if terr != nil {
if terr != tcpip.ErrWouldBlock || !blocking {
return 0, nil, 0, syserr.TranslateNetstackError(terr)
}
var err *syserr.Error
- ep, wq, err = s.blockingAccept(t)
+ ep, wq, err = s.blockingAccept(t, peerAddr)
if err != nil {
return 0, nil, 0, err
}
@@ -907,13 +912,8 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
var addr linux.SockAddr
var addrLen uint32
- if peerRequested {
- // Get address of the peer and write it to peer slice.
- var err *syserr.Error
- addr, addrLen, err = ns.FileOperations.(*SocketOperations).GetPeerName(t)
- if err != nil {
- return 0, nil, 0, err
- }
+ if peerAddr != nil {
+ addr, addrLen = ConvertAddress(s.family, *peerAddr)
}
fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{
@@ -1187,7 +1187,7 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
var v tcpip.LingerOption
var linger linux.Linger
if err := ep.GetSockOpt(&v); err != nil {
- return &linger, nil
+ return nil, syserr.TranslateNetstackError(err)
}
if v.Enabled {
@@ -1512,8 +1512,17 @@ func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name
return &vP, nil
case linux.IP6T_ORIGINAL_DST:
- // TODO(gvisor.dev/issue/170): ip6tables.
- return nil, syserr.ErrInvalidArgument
+ if outLen < int(binary.Size(linux.SockAddrInet6{})) {
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ var v tcpip.OriginalDestinationOption
+ if err := ep.GetSockOpt(&v); err != nil {
+ return nil, syserr.TranslateNetstackError(err)
+ }
+
+ a, _ := ConvertAddress(linux.AF_INET6, tcpip.FullAddress(v))
+ return a.(*linux.SockAddrInet6), nil
case linux.IP6T_SO_GET_INFO:
if outLen < linux.SizeOfIPTGetinfo {
@@ -1555,6 +1564,26 @@ func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name
}
return &entries, nil
+ case linux.IP6T_SO_GET_REVISION_TARGET:
+ if outLen < linux.SizeOfXTGetRevision {
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ // Only valid for raw IPv6 sockets.
+ if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW {
+ return nil, syserr.ErrProtocolNotAvailable
+ }
+
+ stack := inet.StackFromContext(t)
+ if stack == nil {
+ return nil, syserr.ErrNoDevice
+ }
+ ret, err := netfilter.TargetRevision(t, outPtr, header.IPv6ProtocolNumber)
+ if err != nil {
+ return nil, err
+ }
+ return &ret, nil
+
default:
emitUnimplementedEventIPv6(t, name)
}
@@ -1718,6 +1747,26 @@ func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in
}
return &entries, nil
+ case linux.IPT_SO_GET_REVISION_TARGET:
+ if outLen < linux.SizeOfXTGetRevision {
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ // Only valid for raw IPv4 sockets.
+ if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
+ return nil, syserr.ErrProtocolNotAvailable
+ }
+
+ stack := inet.StackFromContext(t)
+ if stack == nil {
+ return nil, syserr.ErrNoDevice
+ }
+ ret, err := netfilter.TargetRevision(t, outPtr, header.IPv4ProtocolNumber)
+ if err != nil {
+ return nil, err
+ }
+ return &ret, nil
+
default:
emitUnimplementedEventIP(t, name)
}
@@ -1770,10 +1819,16 @@ func SetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, level int
case linux.SOL_IP:
return setSockOptIP(t, s, ep, name, optVal)
+ case linux.SOL_PACKET:
+ // gVisor doesn't support any SOL_PACKET options just return not
+ // supported. Returning nil here will result in tcpdump thinking AF_PACKET
+ // features are supported and proceed to use them and break.
+ t.Kernel().EmitUnimplementedEvent(t)
+ return syserr.ErrProtocolNotAvailable
+
case linux.SOL_UDP,
linux.SOL_ICMPV6,
- linux.SOL_RAW,
- linux.SOL_PACKET:
+ linux.SOL_RAW:
t.Kernel().EmitUnimplementedEvent(t)
}
diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go
index 1f7d17f5f..4c6791fff 100644
--- a/pkg/sentry/socket/netstack/netstack_vfs2.go
+++ b/pkg/sentry/socket/netstack/netstack_vfs2.go
@@ -18,6 +18,8 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/amutex"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
@@ -29,8 +31,6 @@ import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
- "gvisor.dev/gvisor/tools/go_marshal/marshal"
- "gvisor.dev/gvisor/tools/go_marshal/primitive"
)
// SocketVFS2 encapsulates all the state needed to represent a network stack
@@ -79,6 +79,13 @@ func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, qu
return vfsfd, nil
}
+// Release implements vfs.FileDescriptionImpl.Release.
+func (s *SocketVFS2) Release(ctx context.Context) {
+ t := kernel.TaskFromContext(ctx)
+ t.Kernel().DeleteSocketVFS2(&s.vfsfd)
+ s.socketOpsCommon.Release(ctx)
+}
+
// Readiness implements waiter.Waitable.Readiness.
func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask {
return s.socketOpsCommon.Readiness(mask)
@@ -151,14 +158,18 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs
// tcpip.Endpoint.
func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
// Issue the accept request to get the new endpoint.
- ep, wq, terr := s.Endpoint.Accept()
+ var peerAddr *tcpip.FullAddress
+ if peerRequested {
+ peerAddr = &tcpip.FullAddress{}
+ }
+ ep, wq, terr := s.Endpoint.Accept(peerAddr)
if terr != nil {
if terr != tcpip.ErrWouldBlock || !blocking {
return 0, nil, 0, syserr.TranslateNetstackError(terr)
}
var err *syserr.Error
- ep, wq, err = s.blockingAccept(t)
+ ep, wq, err = s.blockingAccept(t, peerAddr)
if err != nil {
return 0, nil, 0, err
}
@@ -176,13 +187,9 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block
var addr linux.SockAddr
var addrLen uint32
- if peerRequested {
+ if peerAddr != nil {
// Get address of the peer and write it to peer slice.
- var err *syserr.Error
- addr, addrLen, err = ns.Impl().(*SocketVFS2).GetPeerName(t)
- if err != nil {
- return 0, nil, 0, err
- }
+ addr, addrLen = ConvertAddress(s.family, *peerAddr)
}
fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{
diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go
index f0fe18684..1028d2a6e 100644
--- a/pkg/sentry/socket/netstack/stack.go
+++ b/pkg/sentry/socket/netstack/stack.go
@@ -155,7 +155,7 @@ func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error {
// TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize.
func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) {
- var rs tcp.ReceiveBufferSizeOption
+ var rs tcpip.TCPReceiveBufferSizeRangeOption
err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &rs)
return inet.TCPBufferSize{
Min: rs.Min,
@@ -166,17 +166,17 @@ func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) {
// SetTCPReceiveBufferSize implements inet.Stack.SetTCPReceiveBufferSize.
func (s *Stack) SetTCPReceiveBufferSize(size inet.TCPBufferSize) error {
- rs := tcp.ReceiveBufferSizeOption{
+ rs := tcpip.TCPReceiveBufferSizeRangeOption{
Min: size.Min,
Default: size.Default,
Max: size.Max,
}
- return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, rs)).ToError()
+ return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &rs)).ToError()
}
// TCPSendBufferSize implements inet.Stack.TCPSendBufferSize.
func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) {
- var ss tcp.SendBufferSizeOption
+ var ss tcpip.TCPSendBufferSizeRangeOption
err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &ss)
return inet.TCPBufferSize{
Min: ss.Min,
@@ -187,29 +187,30 @@ func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) {
// SetTCPSendBufferSize implements inet.Stack.SetTCPSendBufferSize.
func (s *Stack) SetTCPSendBufferSize(size inet.TCPBufferSize) error {
- ss := tcp.SendBufferSizeOption{
+ ss := tcpip.TCPSendBufferSizeRangeOption{
Min: size.Min,
Default: size.Default,
Max: size.Max,
}
- return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, ss)).ToError()
+ return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &ss)).ToError()
}
// TCPSACKEnabled implements inet.Stack.TCPSACKEnabled.
func (s *Stack) TCPSACKEnabled() (bool, error) {
- var sack tcp.SACKEnabled
+ var sack tcpip.TCPSACKEnabled
err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &sack)
return bool(sack), syserr.TranslateNetstackError(err).ToError()
}
// SetTCPSACKEnabled implements inet.Stack.SetTCPSACKEnabled.
func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
- return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(enabled))).ToError()
+ opt := tcpip.TCPSACKEnabled(enabled)
+ return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt)).ToError()
}
// TCPRecovery implements inet.Stack.TCPRecovery.
func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) {
- var recovery tcp.Recovery
+ var recovery tcpip.TCPRecovery
if err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &recovery); err != nil {
return 0, syserr.TranslateNetstackError(err).ToError()
}
@@ -218,7 +219,8 @@ func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) {
// SetTCPRecovery implements inet.Stack.SetTCPRecovery.
func (s *Stack) SetTCPRecovery(recovery inet.TCPLossRecovery) error {
- return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.Recovery(recovery))).ToError()
+ opt := tcpip.TCPRecovery(recovery)
+ return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt)).ToError()
}
// Statistics implements inet.Stack.Statistics.
@@ -410,3 +412,24 @@ func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint {
func (s *Stack) RestoreCleanupEndpoints(es []stack.TransportEndpoint) {
s.Stack.RestoreCleanupEndpoints(es)
}
+
+// Forwarding implements inet.Stack.Forwarding.
+func (s *Stack) Forwarding(protocol tcpip.NetworkProtocolNumber) bool {
+ switch protocol {
+ case ipv4.ProtocolNumber, ipv6.ProtocolNumber:
+ return s.Stack.Forwarding(protocol)
+ default:
+ panic(fmt.Sprintf("Forwarding(%v) failed: unsupported protocol", protocol))
+ }
+}
+
+// SetForwarding implements inet.Stack.SetForwarding.
+func (s *Stack) SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error {
+ switch protocol {
+ case ipv4.ProtocolNumber, ipv6.ProtocolNumber:
+ s.Stack.SetForwarding(protocol, enable)
+ default:
+ panic(fmt.Sprintf("SetForwarding(%v) failed: unsupported protocol", protocol))
+ }
+ return nil
+}
diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go
index 04b259d27..fd31479e5 100644
--- a/pkg/sentry/socket/socket.go
+++ b/pkg/sentry/socket/socket.go
@@ -25,6 +25,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/sentry/device"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
@@ -35,7 +36,6 @@ import (
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/usermem"
- "gvisor.dev/gvisor/tools/go_marshal/marshal"
)
// ControlMessages represents the union of unix control messages and tcpip
diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD
index cb953e4dc..cc7408698 100644
--- a/pkg/sentry/socket/unix/BUILD
+++ b/pkg/sentry/socket/unix/BUILD
@@ -7,10 +7,21 @@ go_template_instance(
name = "socket_refs",
out = "socket_refs.go",
package = "unix",
- prefix = "socketOpsCommon",
+ prefix = "socketOperations",
template = "//pkg/refs_vfs2:refs_template",
types = {
- "T": "socketOpsCommon",
+ "T": "SocketOperations",
+ },
+)
+
+go_template_instance(
+ name = "socket_vfs2_refs",
+ out = "socket_vfs2_refs.go",
+ package = "unix",
+ prefix = "socketVFS2",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "SocketVFS2",
},
)
@@ -20,6 +31,7 @@ go_library(
"device.go",
"io.go",
"socket_refs.go",
+ "socket_vfs2_refs.go",
"unix.go",
"unix_vfs2.go",
],
@@ -29,6 +41,7 @@ go_library(
"//pkg/context",
"//pkg/fspath",
"//pkg/log",
+ "//pkg/marshal",
"//pkg/refs",
"//pkg/safemem",
"//pkg/sentry/arch",
@@ -49,6 +62,5 @@ go_library(
"//pkg/tcpip",
"//pkg/usermem",
"//pkg/waiter",
- "//tools/go_marshal/marshal",
],
)
diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go
index e3a75b519..aa4f3c04d 100644
--- a/pkg/sentry/socket/unix/transport/connectioned.go
+++ b/pkg/sentry/socket/unix/transport/connectioned.go
@@ -391,7 +391,7 @@ func (e *connectionedEndpoint) Listen(backlog int) *syserr.Error {
}
// Accept accepts a new connection.
-func (e *connectionedEndpoint) Accept() (Endpoint, *syserr.Error) {
+func (e *connectionedEndpoint) Accept(peerAddr *tcpip.FullAddress) (Endpoint, *syserr.Error) {
e.Lock()
defer e.Unlock()
@@ -401,6 +401,18 @@ func (e *connectionedEndpoint) Accept() (Endpoint, *syserr.Error) {
select {
case ne := <-e.acceptedChan:
+ if peerAddr != nil {
+ ne.Lock()
+ c := ne.connected
+ ne.Unlock()
+ if c != nil {
+ addr, err := c.GetLocalAddress()
+ if err != nil {
+ return nil, syserr.TranslateNetstackError(err)
+ }
+ *peerAddr = addr
+ }
+ }
return ne, nil
default:
diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go
index 4751b2fd8..f8aacca13 100644
--- a/pkg/sentry/socket/unix/transport/connectionless.go
+++ b/pkg/sentry/socket/unix/transport/connectionless.go
@@ -144,12 +144,12 @@ func (e *connectionlessEndpoint) Connect(ctx context.Context, server BoundEndpoi
}
// Listen starts listening on the connection.
-func (e *connectionlessEndpoint) Listen(int) *syserr.Error {
+func (*connectionlessEndpoint) Listen(int) *syserr.Error {
return syserr.ErrNotSupported
}
// Accept accepts a new connection.
-func (e *connectionlessEndpoint) Accept() (Endpoint, *syserr.Error) {
+func (*connectionlessEndpoint) Accept(*tcpip.FullAddress) (Endpoint, *syserr.Error) {
return nil, syserr.ErrNotSupported
}
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index cc9d650fb..d6fc03520 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -151,7 +151,10 @@ type Endpoint interface {
// block if no new connections are available.
//
// The returned Queue is the wait queue for the newly created endpoint.
- Accept() (Endpoint, *syserr.Error)
+ //
+ // peerAddr if not nil will be populated with the address of the connected
+ // peer on a successful accept.
+ Accept(peerAddr *tcpip.FullAddress) (Endpoint, *syserr.Error)
// Bind binds the endpoint to a specific local address and port.
// Specifying a NIC is optional.
@@ -743,6 +746,9 @@ type baseEndpoint struct {
// path is not empty if the endpoint has been bound,
// or may be used if the endpoint is connected.
path string
+
+ // linger is used for SO_LINGER socket option.
+ linger tcpip.LingerOption
}
// EventRegister implements waiter.Waitable.EventRegister.
@@ -838,8 +844,14 @@ func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMess
return n, err
}
-// SetSockOpt sets a socket option. Currently not supported.
-func (e *baseEndpoint) SetSockOpt(tcpip.SettableSocketOption) *tcpip.Error {
+// SetSockOpt sets a socket option.
+func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
+ switch v := opt.(type) {
+ case *tcpip.LingerOption:
+ e.Lock()
+ e.linger = *v
+ e.Unlock()
+ }
return nil
}
@@ -942,8 +954,11 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
func (e *baseEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
- switch opt.(type) {
+ switch o := opt.(type) {
case *tcpip.LingerOption:
+ e.Lock()
+ *o = e.linger
+ e.Unlock()
return nil
default:
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 0a7a26495..f80011ce4 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -24,6 +24,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
@@ -39,7 +40,6 @@ import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
- "gvisor.dev/gvisor/tools/go_marshal/marshal"
)
// SocketOperations is a Unix socket. It is similar to a netstack socket,
@@ -55,6 +55,7 @@ type SocketOperations struct {
fsutil.FileNoopFlush `state:"nosave"`
fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+ socketOperationsRefs
socketOpsCommon
}
@@ -84,11 +85,27 @@ func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, sty
return fs.NewFile(ctx, d, flags, &s)
}
+// DecRef implements RefCounter.DecRef.
+func (s *SocketOperations) DecRef(ctx context.Context) {
+ s.socketOperationsRefs.DecRef(func() {
+ s.ep.Close(ctx)
+ if s.abstractNamespace != nil {
+ s.abstractNamespace.Remove(s.abstractName, s)
+ }
+ })
+}
+
+// Release implemements fs.FileOperations.Release.
+func (s *SocketOperations) Release(ctx context.Context) {
+ // Release only decrements a reference on s because s may be referenced in
+ // the abstract socket namespace.
+ s.DecRef(ctx)
+}
+
// socketOpsCommon contains the socket operations common to VFS1 and VFS2.
//
// +stateify savable
type socketOpsCommon struct {
- socketOpsCommonRefs
socket.SendReceiveTimeout
ep transport.Endpoint
@@ -101,23 +118,6 @@ type socketOpsCommon struct {
abstractNamespace *kernel.AbstractSocketNamespace
}
-// DecRef implements RefCounter.DecRef.
-func (s *socketOpsCommon) DecRef(ctx context.Context) {
- s.socketOpsCommonRefs.DecRef(func() {
- s.ep.Close(ctx)
- if s.abstractNamespace != nil {
- s.abstractNamespace.Remove(s.abstractName, s)
- }
- })
-}
-
-// Release implemements fs.FileOperations.Release.
-func (s *socketOpsCommon) Release(ctx context.Context) {
- // Release only decrements a reference on s because s may be referenced in
- // the abstract socket namespace.
- s.DecRef(ctx)
-}
-
func (s *socketOpsCommon) isPacket() bool {
switch s.stype {
case linux.SOCK_DGRAM, linux.SOCK_SEQPACKET:
@@ -205,7 +205,7 @@ func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error {
// blockingAccept implements a blocking version of accept(2), that is, if no
// connections are ready to be accept, it will block until one becomes ready.
-func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr.Error) {
+func (s *SocketOperations) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (transport.Endpoint, *syserr.Error) {
// Register for notifications.
e, ch := waiter.NewChannelEntry(nil)
s.EventRegister(&e, waiter.EventIn)
@@ -214,7 +214,7 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, *
// Try to accept the connection; if it fails, then wait until we get a
// notification.
for {
- if ep, err := s.ep.Accept(); err != syserr.ErrWouldBlock {
+ if ep, err := s.ep.Accept(peerAddr); err != syserr.ErrWouldBlock {
return ep, err
}
@@ -227,15 +227,18 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, *
// Accept implements the linux syscall accept(2) for sockets backed by
// a transport.Endpoint.
func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
- // Issue the accept request to get the new endpoint.
- ep, err := s.ep.Accept()
+ var peerAddr *tcpip.FullAddress
+ if peerRequested {
+ peerAddr = &tcpip.FullAddress{}
+ }
+ ep, err := s.ep.Accept(peerAddr)
if err != nil {
if err != syserr.ErrWouldBlock || !blocking {
return 0, nil, 0, err
}
var err *syserr.Error
- ep, err = s.blockingAccept(t)
+ ep, err = s.blockingAccept(t, peerAddr)
if err != nil {
return 0, nil, 0, err
}
@@ -252,13 +255,8 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
var addr linux.SockAddr
var addrLen uint32
- if peerRequested {
- // Get address of the peer.
- var err *syserr.Error
- addr, addrLen, err = ns.FileOperations.(*SocketOperations).GetPeerName(t)
- if err != nil {
- return 0, nil, 0, err
- }
+ if peerAddr != nil {
+ addr, addrLen = netstack.ConvertAddress(linux.AF_UNIX, *peerAddr)
}
fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{
diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go
index 65a285b8f..3345124cc 100644
--- a/pkg/sentry/socket/unix/unix_vfs2.go
+++ b/pkg/sentry/socket/unix/unix_vfs2.go
@@ -18,6 +18,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/sentry/arch"
fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
@@ -32,17 +33,19 @@ import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
- "gvisor.dev/gvisor/tools/go_marshal/marshal"
)
// SocketVFS2 implements socket.SocketVFS2 (and by extension,
// vfs.FileDescriptionImpl) for Unix sockets.
+//
+// +stateify savable
type SocketVFS2 struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
vfs.LockFD
+ socketVFS2Refs
socketOpsCommon
}
@@ -53,6 +56,7 @@ var _ = socket.SocketVFS2(&SocketVFS2{})
func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) (*vfs.FileDescription, *syserr.Error) {
mnt := t.Kernel().SocketMount()
d := sockfs.NewDentry(t.Credentials(), mnt)
+ defer d.DecRef(t)
fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d, &vfs.FileLocks{})
if err != nil {
@@ -88,6 +92,25 @@ func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint3
return vfsfd, nil
}
+// DecRef implements RefCounter.DecRef.
+func (s *SocketVFS2) DecRef(ctx context.Context) {
+ s.socketVFS2Refs.DecRef(func() {
+ t := kernel.TaskFromContext(ctx)
+ t.Kernel().DeleteSocketVFS2(&s.vfsfd)
+ s.ep.Close(ctx)
+ if s.abstractNamespace != nil {
+ s.abstractNamespace.Remove(s.abstractName, s)
+ }
+ })
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (s *SocketVFS2) Release(ctx context.Context) {
+ // Release only decrements a reference on s because s may be referenced in
+ // the abstract socket namespace.
+ s.DecRef(ctx)
+}
+
// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
// a transport.Endpoint.
func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
@@ -96,7 +119,7 @@ func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.
// blockingAccept implements a blocking version of accept(2), that is, if no
// connections are ready to be accept, it will block until one becomes ready.
-func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr.Error) {
+func (s *SocketVFS2) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (transport.Endpoint, *syserr.Error) {
// Register for notifications.
e, ch := waiter.NewChannelEntry(nil)
s.socketOpsCommon.EventRegister(&e, waiter.EventIn)
@@ -105,7 +128,7 @@ func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr
// Try to accept the connection; if it fails, then wait until we get a
// notification.
for {
- if ep, err := s.ep.Accept(); err != syserr.ErrWouldBlock {
+ if ep, err := s.ep.Accept(peerAddr); err != syserr.ErrWouldBlock {
return ep, err
}
@@ -118,15 +141,18 @@ func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr
// Accept implements the linux syscall accept(2) for sockets backed by
// a transport.Endpoint.
func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
- // Issue the accept request to get the new endpoint.
- ep, err := s.ep.Accept()
+ var peerAddr *tcpip.FullAddress
+ if peerRequested {
+ peerAddr = &tcpip.FullAddress{}
+ }
+ ep, err := s.ep.Accept(peerAddr)
if err != nil {
if err != syserr.ErrWouldBlock || !blocking {
return 0, nil, 0, err
}
var err *syserr.Error
- ep, err = s.blockingAccept(t)
+ ep, err = s.blockingAccept(t, peerAddr)
if err != nil {
return 0, nil, 0, err
}
@@ -144,13 +170,8 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block
var addr linux.SockAddr
var addrLen uint32
- if peerRequested {
- // Get address of the peer.
- var err *syserr.Error
- addr, addrLen, err = ns.Impl().(*SocketVFS2).GetPeerName(t)
- if err != nil {
- return 0, nil, 0, err
- }
+ if peerAddr != nil {
+ addr, addrLen = netstack.ConvertAddress(linux.AF_UNIX, *peerAddr)
}
fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{
diff --git a/pkg/sentry/state/state.go b/pkg/sentry/state/state.go
index a06c9b8ab..245d2c5cf 100644
--- a/pkg/sentry/state/state.go
+++ b/pkg/sentry/state/state.go
@@ -61,8 +61,10 @@ func (opts SaveOpts) Save(k *kernel.Kernel, w *watchdog.Watchdog) error {
log.Infof("Sandbox save started, pausing all tasks.")
k.Pause()
k.ReceiveTaskStates()
- defer k.Unpause()
- defer log.Infof("Tasks resumed after save.")
+ defer func() {
+ k.Unpause()
+ log.Infof("Tasks resumed after save.")
+ }()
w.Stop()
defer w.Start()
diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD
index 88d5db9fc..a920180d3 100644
--- a/pkg/sentry/strace/BUILD
+++ b/pkg/sentry/strace/BUILD
@@ -28,6 +28,7 @@ go_library(
"//pkg/binary",
"//pkg/bits",
"//pkg/eventchannel",
+ "//pkg/marshal/primitive",
"//pkg/seccomp",
"//pkg/sentry/arch",
"//pkg/sentry/kernel",
diff --git a/pkg/sentry/strace/epoll.go b/pkg/sentry/strace/epoll.go
index 5d51a7792..ae3b998c8 100644
--- a/pkg/sentry/strace/epoll.go
+++ b/pkg/sentry/strace/epoll.go
@@ -26,7 +26,7 @@ import (
func epollEvent(t *kernel.Task, eventAddr usermem.Addr) string {
var e linux.EpollEvent
- if _, err := t.CopyIn(eventAddr, &e); err != nil {
+ if _, err := e.CopyIn(t, eventAddr); err != nil {
return fmt.Sprintf("%#x {error reading event: %v}", eventAddr, err)
}
var sb strings.Builder
@@ -41,7 +41,7 @@ func epollEvents(t *kernel.Task, eventsAddr usermem.Addr, numEvents, maxBytes ui
addr := eventsAddr
for i := uint64(0); i < numEvents; i++ {
var e linux.EpollEvent
- if _, err := t.CopyIn(addr, &e); err != nil {
+ if _, err := e.CopyIn(t, addr); err != nil {
fmt.Fprintf(&sb, "{error reading event at %#x: %v}", addr, err)
continue
}
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index 08e97e6c4..cc5f70cd4 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -21,6 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/abi"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
@@ -166,7 +167,7 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64)
}
buf := make([]byte, length)
- if _, err := t.CopyIn(addr, &buf); err != nil {
+ if _, err := t.CopyInBytes(addr, buf); err != nil {
return fmt.Sprintf("%#x (error decoding control: %v)", addr, err)
}
@@ -302,7 +303,7 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64)
func msghdr(t *kernel.Task, addr usermem.Addr, printContent bool, maxBytes uint64) string {
var msg slinux.MessageHeader64
- if err := slinux.CopyInMessageHeader64(t, addr, &msg); err != nil {
+ if _, err := msg.CopyIn(t, addr); err != nil {
return fmt.Sprintf("%#x (error decoding msghdr: %v)", addr, err)
}
s := fmt.Sprintf(
@@ -380,9 +381,9 @@ func postSockAddr(t *kernel.Task, addr usermem.Addr, lengthPtr usermem.Addr) str
func copySockLen(t *kernel.Task, addr usermem.Addr) (uint32, error) {
// socklen_t is 32-bits.
- var l uint32
- _, err := t.CopyIn(addr, &l)
- return l, err
+ var l primitive.Uint32
+ _, err := l.CopyIn(t, addr)
+ return uint32(l), err
}
func sockLenPointer(t *kernel.Task, addr usermem.Addr) string {
@@ -436,22 +437,22 @@ func getSockOptVal(t *kernel.Task, level, optname uint64, optVal usermem.Addr, o
func sockOptVal(t *kernel.Task, level, optname uint64, optVal usermem.Addr, optLen uint64, maximumBlobSize uint) string {
switch optLen {
case 1:
- var v uint8
- _, err := t.CopyIn(optVal, &v)
+ var v primitive.Uint8
+ _, err := v.CopyIn(t, optVal)
if err != nil {
return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err)
}
return fmt.Sprintf("%#x {value=%v}", optVal, v)
case 2:
- var v uint16
- _, err := t.CopyIn(optVal, &v)
+ var v primitive.Uint16
+ _, err := v.CopyIn(t, optVal)
if err != nil {
return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err)
}
return fmt.Sprintf("%#x {value=%v}", optVal, v)
case 4:
- var v uint32
- _, err := t.CopyIn(optVal, &v)
+ var v primitive.Uint32
+ _, err := v.CopyIn(t, optVal)
if err != nil {
return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err)
}
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index 87b239730..396744597 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -17,17 +17,16 @@
package strace
import (
- "encoding/binary"
"fmt"
"strconv"
"strings"
- "syscall"
"time"
"gvisor.dev/gvisor/pkg/abi"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/bits"
"gvisor.dev/gvisor/pkg/eventchannel"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/seccomp"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -91,7 +90,7 @@ func iovecs(t *kernel.Task, addr usermem.Addr, iovcnt int, printContent bool, ma
}
b := make([]byte, size)
- amt, err := t.CopyIn(ar.Start, b)
+ amt, err := t.CopyInBytes(ar.Start, b)
if err != nil {
iovs[i] = fmt.Sprintf("{base=%#x, len=%d, %q..., error decoding string: %v}", ar.Start, ar.Length(), b[:amt], err)
continue
@@ -118,7 +117,7 @@ func dump(t *kernel.Task, addr usermem.Addr, size uint, maximumBlobSize uint) st
}
b := make([]byte, size)
- amt, err := t.CopyIn(addr, b)
+ amt, err := t.CopyInBytes(addr, b)
if err != nil {
return fmt.Sprintf("%#x (error decoding string: %s)", addr, err)
}
@@ -199,7 +198,7 @@ func fdVFS2(t *kernel.Task, fd int32) string {
func fdpair(t *kernel.Task, addr usermem.Addr) string {
var fds [2]int32
- _, err := t.CopyIn(addr, &fds)
+ _, err := primitive.CopyInt32SliceIn(t, addr, fds[:])
if err != nil {
return fmt.Sprintf("%#x (error decoding fds: %s)", addr, err)
}
@@ -209,7 +208,7 @@ func fdpair(t *kernel.Task, addr usermem.Addr) string {
func uname(t *kernel.Task, addr usermem.Addr) string {
var u linux.UtsName
- if _, err := t.CopyIn(addr, &u); err != nil {
+ if _, err := u.CopyIn(t, addr); err != nil {
return fmt.Sprintf("%#x (error decoding utsname: %s)", addr, err)
}
@@ -222,7 +221,7 @@ func utimensTimespec(t *kernel.Task, addr usermem.Addr) string {
}
var tim linux.Timespec
- if _, err := t.CopyIn(addr, &tim); err != nil {
+ if _, err := tim.CopyIn(t, addr); err != nil {
return fmt.Sprintf("%#x (error decoding timespec: %s)", addr, err)
}
@@ -244,7 +243,7 @@ func timespec(t *kernel.Task, addr usermem.Addr) string {
}
var tim linux.Timespec
- if _, err := t.CopyIn(addr, &tim); err != nil {
+ if _, err := tim.CopyIn(t, addr); err != nil {
return fmt.Sprintf("%#x (error decoding timespec: %s)", addr, err)
}
return fmt.Sprintf("%#x {sec=%v nsec=%v}", addr, tim.Sec, tim.Nsec)
@@ -256,7 +255,7 @@ func timeval(t *kernel.Task, addr usermem.Addr) string {
}
var tim linux.Timeval
- if _, err := t.CopyIn(addr, &tim); err != nil {
+ if _, err := tim.CopyIn(t, addr); err != nil {
return fmt.Sprintf("%#x (error decoding timeval: %s)", addr, err)
}
@@ -268,8 +267,8 @@ func utimbuf(t *kernel.Task, addr usermem.Addr) string {
return "null"
}
- var utim syscall.Utimbuf
- if _, err := t.CopyIn(addr, &utim); err != nil {
+ var utim linux.Utime
+ if _, err := utim.CopyIn(t, addr); err != nil {
return fmt.Sprintf("%#x (error decoding utimbuf: %s)", addr, err)
}
@@ -282,7 +281,7 @@ func stat(t *kernel.Task, addr usermem.Addr) string {
}
var stat linux.Stat
- if _, err := t.CopyIn(addr, &stat); err != nil {
+ if _, err := stat.CopyIn(t, addr); err != nil {
return fmt.Sprintf("%#x (error decoding stat: %s)", addr, err)
}
return fmt.Sprintf("%#x {dev=%d, ino=%d, mode=%s, nlink=%d, uid=%d, gid=%d, rdev=%d, size=%d, blksize=%d, blocks=%d, atime=%s, mtime=%s, ctime=%s}", addr, stat.Dev, stat.Ino, linux.FileMode(stat.Mode), stat.Nlink, stat.UID, stat.GID, stat.Rdev, stat.Size, stat.Blksize, stat.Blocks, time.Unix(stat.ATime.Sec, stat.ATime.Nsec), time.Unix(stat.MTime.Sec, stat.MTime.Nsec), time.Unix(stat.CTime.Sec, stat.CTime.Nsec))
@@ -294,7 +293,7 @@ func itimerval(t *kernel.Task, addr usermem.Addr) string {
}
interval := timeval(t, addr)
- value := timeval(t, addr+usermem.Addr(binary.Size(linux.Timeval{})))
+ value := timeval(t, addr+usermem.Addr((*linux.Timeval)(nil).SizeBytes()))
return fmt.Sprintf("%#x {interval=%s, value=%s}", addr, interval, value)
}
@@ -304,7 +303,7 @@ func itimerspec(t *kernel.Task, addr usermem.Addr) string {
}
interval := timespec(t, addr)
- value := timespec(t, addr+usermem.Addr(binary.Size(linux.Timespec{})))
+ value := timespec(t, addr+usermem.Addr((*linux.Timespec)(nil).SizeBytes()))
return fmt.Sprintf("%#x {interval=%s, value=%s}", addr, interval, value)
}
@@ -330,7 +329,7 @@ func rusage(t *kernel.Task, addr usermem.Addr) string {
}
var ru linux.Rusage
- if _, err := t.CopyIn(addr, &ru); err != nil {
+ if _, err := ru.CopyIn(t, addr); err != nil {
return fmt.Sprintf("%#x (error decoding rusage: %s)", addr, err)
}
return fmt.Sprintf("%#x %+v", addr, ru)
@@ -342,7 +341,7 @@ func capHeader(t *kernel.Task, addr usermem.Addr) string {
}
var hdr linux.CapUserHeader
- if _, err := t.CopyIn(addr, &hdr); err != nil {
+ if _, err := hdr.CopyIn(t, addr); err != nil {
return fmt.Sprintf("%#x (error decoding header: %s)", addr, err)
}
@@ -367,7 +366,7 @@ func capData(t *kernel.Task, hdrAddr, dataAddr usermem.Addr) string {
}
var hdr linux.CapUserHeader
- if _, err := t.CopyIn(hdrAddr, &hdr); err != nil {
+ if _, err := hdr.CopyIn(t, hdrAddr); err != nil {
return fmt.Sprintf("%#x (error decoding header: %v)", dataAddr, err)
}
@@ -376,7 +375,7 @@ func capData(t *kernel.Task, hdrAddr, dataAddr usermem.Addr) string {
switch hdr.Version {
case linux.LINUX_CAPABILITY_VERSION_1:
var data linux.CapUserData
- if _, err := t.CopyIn(dataAddr, &data); err != nil {
+ if _, err := data.CopyIn(t, dataAddr); err != nil {
return fmt.Sprintf("%#x (error decoding data: %v)", dataAddr, err)
}
p = uint64(data.Permitted)
@@ -384,7 +383,7 @@ func capData(t *kernel.Task, hdrAddr, dataAddr usermem.Addr) string {
e = uint64(data.Effective)
case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3:
var data [2]linux.CapUserData
- if _, err := t.CopyIn(dataAddr, &data); err != nil {
+ if _, err := linux.CopyCapUserDataSliceIn(t, dataAddr, data[:]); err != nil {
return fmt.Sprintf("%#x (error decoding data: %v)", dataAddr, err)
}
p = uint64(data[0].Permitted) | (uint64(data[1].Permitted) << 32)
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index 4a9b04fd0..75752b2e6 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -56,6 +56,7 @@ go_library(
"sys_xattr.go",
"timespec.go",
],
+ marshal = True,
visibility = ["//:sandbox"],
deps = [
"//pkg/abi",
@@ -64,6 +65,8 @@ go_library(
"//pkg/bpf",
"//pkg/context",
"//pkg/log",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
"//pkg/metric",
"//pkg/rand",
"//pkg/safemem",
@@ -99,7 +102,5 @@ go_library(
"//pkg/syserror",
"//pkg/usermem",
"//pkg/waiter",
- "//tools/go_marshal/marshal",
- "//tools/go_marshal/primitive",
],
)
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index da6bd85e1..5f26697d2 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -138,7 +138,7 @@ var AMD64 = &kernel.SyscallTable{
83: syscalls.Supported("mkdir", Mkdir),
84: syscalls.Supported("rmdir", Rmdir),
85: syscalls.Supported("creat", Creat),
- 86: syscalls.Supported("link", Link),
+ 86: syscalls.PartiallySupported("link", Link, "Limited support with Gofer. Link count and linked files may get out of sync because gVisor is not aware of external hardlinks.", nil),
87: syscalls.Supported("unlink", Unlink),
88: syscalls.Supported("symlink", Symlink),
89: syscalls.Supported("readlink", Readlink),
@@ -317,7 +317,7 @@ var AMD64 = &kernel.SyscallTable{
262: syscalls.Supported("fstatat", Fstatat),
263: syscalls.Supported("unlinkat", Unlinkat),
264: syscalls.Supported("renameat", Renameat),
- 265: syscalls.Supported("linkat", Linkat),
+ 265: syscalls.PartiallySupported("linkat", Linkat, "See link(2).", nil),
266: syscalls.Supported("symlinkat", Symlinkat),
267: syscalls.Supported("readlinkat", Readlinkat),
268: syscalls.Supported("fchmodat", Fchmodat),
diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go
index e9d64dec5..0bf313a13 100644
--- a/pkg/sentry/syscalls/linux/sys_aio.go
+++ b/pkg/sentry/syscalls/linux/sys_aio.go
@@ -17,6 +17,7 @@ package linux
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -36,7 +37,7 @@ func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
//
// The context pointer _must_ be zero initially.
var idIn uint64
- if _, err := t.CopyIn(idAddr, &idIn); err != nil {
+ if _, err := primitive.CopyUint64In(t, idAddr, &idIn); err != nil {
return 0, nil, err
}
if idIn != 0 {
@@ -49,7 +50,7 @@ func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
}
// Copy out the new ID.
- if _, err := t.CopyOut(idAddr, &id); err != nil {
+ if _, err := primitive.CopyUint64Out(t, idAddr, id); err != nil {
t.MemoryManager().DestroyAIOContext(t, id)
return 0, nil, err
}
@@ -142,7 +143,7 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
ev := v.(*linux.IOEvent)
// Copy out the result.
- if _, err := t.CopyOut(eventsAddr, ev); err != nil {
+ if _, err := ev.CopyOut(t, eventsAddr); err != nil {
if count > 0 {
return uintptr(count), nil, nil
}
@@ -338,21 +339,27 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
}
for i := int32(0); i < nrEvents; i++ {
- // Copy in the address.
- cbAddrNative := t.Arch().Native(0)
- if _, err := t.CopyIn(addr, cbAddrNative); err != nil {
- if i > 0 {
- // Some successful.
- return uintptr(i), nil, nil
+ // Copy in the callback address.
+ var cbAddr usermem.Addr
+ switch t.Arch().Width() {
+ case 8:
+ var cbAddrP primitive.Uint64
+ if _, err := cbAddrP.CopyIn(t, addr); err != nil {
+ if i > 0 {
+ // Some successful.
+ return uintptr(i), nil, nil
+ }
+ // Nothing done.
+ return 0, nil, err
}
- // Nothing done.
- return 0, nil, err
+ cbAddr = usermem.Addr(cbAddrP)
+ default:
+ return 0, nil, syserror.ENOSYS
}
// Copy in this callback.
var cb linux.IOCallback
- cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative))
- if _, err := t.CopyIn(cbAddr, &cb); err != nil {
+ if _, err := cb.CopyIn(t, cbAddr); err != nil {
if i > 0 {
// Some have been successful.
diff --git a/pkg/sentry/syscalls/linux/sys_capability.go b/pkg/sentry/syscalls/linux/sys_capability.go
index adf5ea5f2..d3b85e11b 100644
--- a/pkg/sentry/syscalls/linux/sys_capability.go
+++ b/pkg/sentry/syscalls/linux/sys_capability.go
@@ -45,7 +45,7 @@ func Capget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
dataAddr := args[1].Pointer()
var hdr linux.CapUserHeader
- if _, err := t.CopyIn(hdrAddr, &hdr); err != nil {
+ if _, err := hdr.CopyIn(t, hdrAddr); err != nil {
return 0, nil, err
}
// hdr.Pid doesn't need to be valid if this capget() is a "version probe"
@@ -65,7 +65,7 @@ func Capget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
Permitted: uint32(p),
Inheritable: uint32(i),
}
- _, err = t.CopyOut(dataAddr, &data)
+ _, err = data.CopyOut(t, dataAddr)
return 0, nil, err
case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3:
@@ -88,12 +88,12 @@ func Capget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
Inheritable: uint32(i >> 32),
},
}
- _, err = t.CopyOut(dataAddr, &data)
+ _, err = linux.CopyCapUserDataSliceOut(t, dataAddr, data[:])
return 0, nil, err
default:
hdr.Version = linux.HighestCapabilityVersion
- if _, err := t.CopyOut(hdrAddr, &hdr); err != nil {
+ if _, err := hdr.CopyOut(t, hdrAddr); err != nil {
return 0, nil, err
}
if dataAddr != 0 {
@@ -109,7 +109,7 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
dataAddr := args[1].Pointer()
var hdr linux.CapUserHeader
- if _, err := t.CopyIn(hdrAddr, &hdr); err != nil {
+ if _, err := hdr.CopyIn(t, hdrAddr); err != nil {
return 0, nil, err
}
switch hdr.Version {
@@ -118,7 +118,7 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return 0, nil, syserror.EPERM
}
var data linux.CapUserData
- if _, err := t.CopyIn(dataAddr, &data); err != nil {
+ if _, err := data.CopyIn(t, dataAddr); err != nil {
return 0, nil, err
}
p := auth.CapabilitySet(data.Permitted) & auth.AllCapabilities
@@ -131,7 +131,7 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return 0, nil, syserror.EPERM
}
var data [2]linux.CapUserData
- if _, err := t.CopyIn(dataAddr, &data); err != nil {
+ if _, err := linux.CopyCapUserDataSliceIn(t, dataAddr, data[:]); err != nil {
return 0, nil, err
}
p := (auth.CapabilitySet(data[0].Permitted) | (auth.CapabilitySet(data[1].Permitted) << 32)) & auth.AllCapabilities
@@ -141,7 +141,7 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
default:
hdr.Version = linux.HighestCapabilityVersion
- if _, err := t.CopyOut(hdrAddr, &hdr); err != nil {
+ if _, err := hdr.CopyOut(t, hdrAddr); err != nil {
return 0, nil, err
}
return 0, nil, syserror.EINVAL
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index 256422689..98331eb3c 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -19,6 +19,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/lock"
@@ -601,19 +602,19 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Shared flags between file and socket.
switch request {
case linux.FIONCLEX:
- t.FDTable().SetFlags(fd, kernel.FDFlags{
+ t.FDTable().SetFlags(t, fd, kernel.FDFlags{
CloseOnExec: false,
})
return 0, nil, nil
case linux.FIOCLEX:
- t.FDTable().SetFlags(fd, kernel.FDFlags{
+ t.FDTable().SetFlags(t, fd, kernel.FDFlags{
CloseOnExec: true,
})
return 0, nil, nil
case linux.FIONBIO:
var set int32
- if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+ if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
return 0, nil, err
}
flags := file.Flags()
@@ -627,7 +628,7 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
case linux.FIOASYNC:
var set int32
- if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+ if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
return 0, nil, err
}
flags := file.Flags()
@@ -641,15 +642,14 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
case linux.FIOSETOWN, linux.SIOCSPGRP:
var set int32
- if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+ if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
return 0, nil, err
}
fSetOwn(t, file, set)
return 0, nil, nil
case linux.FIOGETOWN, linux.SIOCGPGRP:
- who := fGetOwn(t, file)
- _, err := t.CopyOut(args[2].Pointer(), &who)
+ _, err := primitive.CopyInt32Out(t, args[2].Pointer(), fGetOwn(t, file))
return 0, nil, err
default:
@@ -694,7 +694,7 @@ func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
}
// Top it off with a terminator.
- _, err = t.CopyOut(addr+usermem.Addr(bytes), []byte("\x00"))
+ _, err = t.CopyOutBytes(addr+usermem.Addr(bytes), []byte("\x00"))
return uintptr(bytes + 1), nil, err
}
@@ -787,7 +787,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Note that Remove provides a reference on the file that we may use to
// flush. It is still active until we drop the final reference below
// (and other reference-holding operations complete).
- file, _ := t.FDTable().Remove(fd)
+ file, _ := t.FDTable().Remove(t, fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -941,7 +941,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return uintptr(flags.ToLinuxFDFlags()), nil, nil
case linux.F_SETFD:
flags := args[2].Uint()
- err := t.FDTable().SetFlags(fd, kernel.FDFlags{
+ err := t.FDTable().SetFlags(t, fd, kernel.FDFlags{
CloseOnExec: flags&linux.FD_CLOEXEC != 0,
})
return 0, nil, err
@@ -962,7 +962,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Copy in the lock request.
flockAddr := args[2].Pointer()
var flock linux.Flock
- if _, err := t.CopyIn(flockAddr, &flock); err != nil {
+ if _, err := flock.CopyIn(t, flockAddr); err != nil {
return 0, nil, err
}
@@ -1052,12 +1052,12 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
case linux.F_GETOWN_EX:
addr := args[2].Pointer()
owner := fGetOwnEx(t, file)
- _, err := t.CopyOut(addr, &owner)
+ _, err := owner.CopyOut(t, addr)
return 0, nil, err
case linux.F_SETOWN_EX:
addr := args[2].Pointer()
var owner linux.FOwnerEx
- _, err := t.CopyIn(addr, &owner)
+ _, err := owner.CopyIn(t, addr)
if err != nil {
return 0, nil, err
}
@@ -1154,6 +1154,10 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, nil
}
+// LINT.ThenChange(vfs2/fd.go)
+
+// LINT.IfChange
+
func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error {
path, _, err := copyInPath(t, addr, false /* allowEmpty */)
if err != nil {
@@ -1918,7 +1922,7 @@ func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
ts := defaultSetToSystemTimeSpec()
if timesAddr != 0 {
var times linux.Utime
- if _, err := t.CopyIn(timesAddr, &times); err != nil {
+ if _, err := times.CopyIn(t, timesAddr); err != nil {
return 0, nil, err
}
ts = fs.TimeSpec{
@@ -1938,7 +1942,7 @@ func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
ts := defaultSetToSystemTimeSpec()
if timesAddr != 0 {
var times [2]linux.Timeval
- if _, err := t.CopyIn(timesAddr, &times); err != nil {
+ if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil {
return 0, nil, err
}
ts = fs.TimeSpec{
@@ -1966,7 +1970,7 @@ func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
ts := defaultSetToSystemTimeSpec()
if timesAddr != 0 {
var times [2]linux.Timespec
- if _, err := t.CopyIn(timesAddr, &times); err != nil {
+ if _, err := linux.CopyTimespecSliceIn(t, timesAddr, times[:]); err != nil {
return 0, nil, err
}
if !timespecIsValid(times[0]) || !timespecIsValid(times[1]) {
@@ -2000,7 +2004,7 @@ func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
ts := defaultSetToSystemTimeSpec()
if timesAddr != 0 {
var times [2]linux.Timeval
- if _, err := t.CopyIn(timesAddr, &times); err != nil {
+ if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil {
return 0, nil, err
}
if times[0].Usec >= 1e6 || times[0].Usec < 0 ||
diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go
index 12b2fa690..f39ce0639 100644
--- a/pkg/sentry/syscalls/linux/sys_futex.go
+++ b/pkg/sentry/syscalls/linux/sys_futex.go
@@ -306,8 +306,8 @@ func GetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
// Despite the syscall using the name 'pid' for this variable, it is
// very much a tid.
tid := args[0].Int()
- head := args[1].Pointer()
- size := args[2].Pointer()
+ headAddr := args[1].Pointer()
+ sizeAddr := args[2].Pointer()
if tid < 0 {
return 0, nil, syserror.EINVAL
@@ -321,12 +321,16 @@ func GetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
}
// Copy out head pointer.
- if _, err := t.CopyOut(head, uint64(ot.GetRobustList())); err != nil {
+ head := t.Arch().Native(uintptr(ot.GetRobustList()))
+ if _, err := head.CopyOut(t, headAddr); err != nil {
return 0, nil, err
}
- // Copy out size, which is a constant.
- if _, err := t.CopyOut(size, uint64(linux.SizeOfRobustListHead)); err != nil {
+ // Copy out size, which is a constant. Note that while size isn't
+ // an address, it is defined as the arch-dependent size_t, so it
+ // needs to be converted to a native-sized int.
+ size := t.Arch().Native(uintptr(linux.SizeOfRobustListHead))
+ if _, err := size.CopyOut(t, sizeAddr); err != nil {
return 0, nil, err
}
diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go
index 59004cefe..b25f7d881 100644
--- a/pkg/sentry/syscalls/linux/sys_getdents.go
+++ b/pkg/sentry/syscalls/linux/sys_getdents.go
@@ -19,7 +19,6 @@ import (
"io"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -93,19 +92,23 @@ func getdents(t *kernel.Task, fd int32, addr usermem.Addr, size int, f func(*dir
}
}
-// oldDirentHdr is a fixed sized header matching the fixed size
-// fields found in the old linux dirent struct.
+// oldDirentHdr is a fixed sized header matching the fixed size fields found in
+// the old linux dirent struct.
+//
+// +marshal
type oldDirentHdr struct {
Ino uint64
Off uint64
- Reclen uint16
+ Reclen uint16 `marshal:"unaligned"` // Struct ends mid-word.
}
-// direntHdr is a fixed sized header matching the fixed size
-// fields found in the new linux dirent struct.
+// direntHdr is a fixed sized header matching the fixed size fields found in the
+// new linux dirent struct.
+//
+// +marshal
type direntHdr struct {
OldHdr oldDirentHdr
- Typ uint8
+ Typ uint8 `marshal:"unaligned"` // Struct ends mid-word.
}
// dirent contains the data pointed to by a new linux dirent struct.
@@ -134,20 +137,20 @@ func newDirent(width uint, name string, attr fs.DentAttr, offset uint64) *dirent
// the old linux dirent format.
func smallestDirent(a arch.Context) uint {
d := dirent{}
- return uint(binary.Size(d.Hdr.OldHdr)) + a.Width() + 1
+ return uint(d.Hdr.OldHdr.SizeBytes()) + a.Width() + 1
}
// smallestDirent64 returns the size of the smallest possible dirent using
// the new linux dirent format.
func smallestDirent64(a arch.Context) uint {
d := dirent{}
- return uint(binary.Size(d.Hdr)) + a.Width()
+ return uint(d.Hdr.SizeBytes()) + a.Width()
}
// padRec pads the name field until the rec length is a multiple of the width,
// which must be a power of 2. It returns the padded rec length.
func (d *dirent) padRec(width int) uint16 {
- a := int(binary.Size(d.Hdr)) + len(d.Name)
+ a := d.Hdr.SizeBytes() + len(d.Name)
r := (a + width) &^ (width - 1)
padding := r - a
d.Name = append(d.Name, make([]byte, padding)...)
@@ -157,7 +160,7 @@ func (d *dirent) padRec(width int) uint16 {
// Serialize64 serializes a Dirent struct to a byte slice, keeping the new
// linux dirent format. Returns the number of bytes serialized or an error.
func (d *dirent) Serialize64(w io.Writer) (int, error) {
- n1, err := w.Write(binary.Marshal(nil, usermem.ByteOrder, d.Hdr))
+ n1, err := d.Hdr.WriteTo(w)
if err != nil {
return 0, err
}
@@ -165,14 +168,14 @@ func (d *dirent) Serialize64(w io.Writer) (int, error) {
if err != nil {
return 0, err
}
- return n1 + n2, nil
+ return int(n1) + n2, nil
}
// Serialize serializes a Dirent struct to a byte slice, using the old linux
// dirent format.
// Returns the number of bytes serialized or an error.
func (d *dirent) Serialize(w io.Writer) (int, error) {
- n1, err := w.Write(binary.Marshal(nil, usermem.ByteOrder, d.Hdr.OldHdr))
+ n1, err := d.Hdr.OldHdr.WriteTo(w)
if err != nil {
return 0, err
}
@@ -184,7 +187,7 @@ func (d *dirent) Serialize(w io.Writer) (int, error) {
if err != nil {
return 0, err
}
- return n1 + n2 + n3, nil
+ return int(n1) + n2 + n3, nil
}
// direntSerializer implements fs.InodeOperationsInfoSerializer, serializing dirents to an
diff --git a/pkg/sentry/syscalls/linux/sys_identity.go b/pkg/sentry/syscalls/linux/sys_identity.go
index 715ac45e6..a29d307e5 100644
--- a/pkg/sentry/syscalls/linux/sys_identity.go
+++ b/pkg/sentry/syscalls/linux/sys_identity.go
@@ -49,13 +49,13 @@ func Getresuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
ruid := c.RealKUID.In(c.UserNamespace).OrOverflow()
euid := c.EffectiveKUID.In(c.UserNamespace).OrOverflow()
suid := c.SavedKUID.In(c.UserNamespace).OrOverflow()
- if _, err := t.CopyOut(ruidAddr, ruid); err != nil {
+ if _, err := ruid.CopyOut(t, ruidAddr); err != nil {
return 0, nil, err
}
- if _, err := t.CopyOut(euidAddr, euid); err != nil {
+ if _, err := euid.CopyOut(t, euidAddr); err != nil {
return 0, nil, err
}
- if _, err := t.CopyOut(suidAddr, suid); err != nil {
+ if _, err := suid.CopyOut(t, suidAddr); err != nil {
return 0, nil, err
}
return 0, nil, nil
@@ -84,13 +84,13 @@ func Getresgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
rgid := c.RealKGID.In(c.UserNamespace).OrOverflow()
egid := c.EffectiveKGID.In(c.UserNamespace).OrOverflow()
sgid := c.SavedKGID.In(c.UserNamespace).OrOverflow()
- if _, err := t.CopyOut(rgidAddr, rgid); err != nil {
+ if _, err := rgid.CopyOut(t, rgidAddr); err != nil {
return 0, nil, err
}
- if _, err := t.CopyOut(egidAddr, egid); err != nil {
+ if _, err := egid.CopyOut(t, egidAddr); err != nil {
return 0, nil, err
}
- if _, err := t.CopyOut(sgidAddr, sgid); err != nil {
+ if _, err := sgid.CopyOut(t, sgidAddr); err != nil {
return 0, nil, err
}
return 0, nil, nil
@@ -157,7 +157,7 @@ func Getgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
for i, kgid := range kgids {
gids[i] = kgid.In(t.UserNamespace()).OrOverflow()
}
- if _, err := t.CopyOut(args[1].Pointer(), gids); err != nil {
+ if _, err := auth.CopyGIDSliceOut(t, args[1].Pointer(), gids); err != nil {
return 0, nil, err
}
return uintptr(len(gids)), nil, nil
@@ -173,7 +173,7 @@ func Setgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, t.SetExtraGIDs(nil)
}
gids := make([]auth.GID, size)
- if _, err := t.CopyIn(args[1].Pointer(), &gids); err != nil {
+ if _, err := auth.CopyGIDSliceIn(t, args[1].Pointer(), gids); err != nil {
return 0, nil, err
}
return 0, nil, t.SetExtraGIDs(gids)
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go
index d0109baa4..cd8dfdfa4 100644
--- a/pkg/sentry/syscalls/linux/sys_mmap.go
+++ b/pkg/sentry/syscalls/linux/sys_mmap.go
@@ -100,6 +100,15 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if err := file.ConfigureMMap(t, &opts); err != nil {
return 0, nil, err
}
+ } else if shared {
+ // Back shared anonymous mappings with a special mappable.
+ opts.Offset = 0
+ m, err := mm.NewSharedAnonMappable(opts.Length, t.Kernel())
+ if err != nil {
+ return 0, nil, err
+ }
+ opts.MappingIdentity = m // transfers ownership of m to opts
+ opts.Mappable = m
}
rv, err := t.MemoryManager().MMap(t, opts)
@@ -239,7 +248,7 @@ func Mincore(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
return 0, nil, syserror.ENOMEM
}
resident := bytes.Repeat([]byte{1}, int(mapped/usermem.PageSize))
- _, err := t.CopyOut(vec, resident)
+ _, err := t.CopyOutBytes(vec, resident)
return 0, nil, err
}
diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go
index 3149e4aad..849a47476 100644
--- a/pkg/sentry/syscalls/linux/sys_pipe.go
+++ b/pkg/sentry/syscalls/linux/sys_pipe.go
@@ -16,6 +16,7 @@ package linux
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -46,9 +47,9 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) {
return 0, err
}
- if _, err := t.CopyOut(addr, fds); err != nil {
+ if _, err := primitive.CopyInt32SliceOut(t, addr, fds); err != nil {
for _, fd := range fds {
- if file, _ := t.FDTable().Remove(fd); file != nil {
+ if file, _ := t.FDTable().Remove(t, fd); file != nil {
file.DecRef(t)
}
}
diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go
index 789e2ed5b..254f4c9f9 100644
--- a/pkg/sentry/syscalls/linux/sys_poll.go
+++ b/pkg/sentry/syscalls/linux/sys_poll.go
@@ -162,7 +162,7 @@ func CopyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD
pfd := make([]linux.PollFD, nfds)
if nfds > 0 {
- if _, err := t.CopyIn(addr, &pfd); err != nil {
+ if _, err := linux.CopyPollFDSliceIn(t, addr, pfd); err != nil {
return nil, err
}
}
@@ -189,7 +189,7 @@ func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration)
// The poll entries are copied out regardless of whether
// any are set or not. This aligns with the Linux behavior.
if nfds > 0 && err == nil {
- if _, err := t.CopyOut(addr, pfd); err != nil {
+ if _, err := linux.CopyPollFDSliceOut(t, addr, pfd); err != nil {
return remainingTimeout, 0, err
}
}
@@ -202,7 +202,7 @@ func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialBy
set := make([]byte, nBytes)
if addr != 0 {
- if _, err := t.CopyIn(addr, &set); err != nil {
+ if _, err := t.CopyInBytes(addr, set); err != nil {
return nil, err
}
// If we only use part of the last byte, mask out the extraneous bits.
@@ -329,19 +329,19 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add
// Copy updated vectors back.
if readFDs != 0 {
- if _, err := t.CopyOut(readFDs, r); err != nil {
+ if _, err := t.CopyOutBytes(readFDs, r); err != nil {
return 0, err
}
}
if writeFDs != 0 {
- if _, err := t.CopyOut(writeFDs, w); err != nil {
+ if _, err := t.CopyOutBytes(writeFDs, w); err != nil {
return 0, err
}
}
if exceptFDs != 0 {
- if _, err := t.CopyOut(exceptFDs, e); err != nil {
+ if _, err := t.CopyOutBytes(exceptFDs, e); err != nil {
return 0, err
}
}
diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go
index 64a725296..a892d2c62 100644
--- a/pkg/sentry/syscalls/linux/sys_prctl.go
+++ b/pkg/sentry/syscalls/linux/sys_prctl.go
@@ -18,6 +18,7 @@ import (
"fmt"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fsbridge"
@@ -43,7 +44,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return 0, nil, nil
case linux.PR_GET_PDEATHSIG:
- _, err := t.CopyOut(args[1].Pointer(), int32(t.ParentDeathSignal()))
+ _, err := primitive.CopyInt32Out(t, args[1].Pointer(), int32(t.ParentDeathSignal()))
return 0, nil, err
case linux.PR_GET_DUMPABLE:
@@ -110,7 +111,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
buf[len] = 0
len++
}
- _, err := t.CopyOut(addr, buf[:len])
+ _, err := t.CopyOutBytes(addr, buf[:len])
if err != nil {
return 0, nil, err
}
diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go
index d5d5b6959..309c183a3 100644
--- a/pkg/sentry/syscalls/linux/sys_rlimit.go
+++ b/pkg/sentry/syscalls/linux/sys_rlimit.go
@@ -16,6 +16,7 @@ package linux
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/limits"
@@ -26,17 +27,13 @@ import (
// rlimit describes an implementation of 'struct rlimit', which may vary from
// system-to-system.
type rlimit interface {
+ marshal.Marshallable
+
// toLimit converts an rlimit to a limits.Limit.
toLimit() *limits.Limit
// fromLimit converts a limits.Limit to an rlimit.
fromLimit(lim limits.Limit)
-
- // copyIn copies an rlimit from the untrusted app to the kernel.
- copyIn(t *kernel.Task, addr usermem.Addr) error
-
- // copyOut copies an rlimit from the kernel to the untrusted app.
- copyOut(t *kernel.Task, addr usermem.Addr) error
}
// newRlimit returns the appropriate rlimit type for 'struct rlimit' on this system.
@@ -50,6 +47,7 @@ func newRlimit(t *kernel.Task) (rlimit, error) {
}
}
+// +marshal
type rlimit64 struct {
Cur uint64
Max uint64
@@ -70,12 +68,12 @@ func (r *rlimit64) fromLimit(lim limits.Limit) {
}
func (r *rlimit64) copyIn(t *kernel.Task, addr usermem.Addr) error {
- _, err := t.CopyIn(addr, r)
+ _, err := r.CopyIn(t, addr)
return err
}
func (r *rlimit64) copyOut(t *kernel.Task, addr usermem.Addr) error {
- _, err := t.CopyOut(addr, *r)
+ _, err := r.CopyOut(t, addr)
return err
}
@@ -140,7 +138,8 @@ func Getrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, err
}
rlim.fromLimit(lim)
- return 0, nil, rlim.copyOut(t, addr)
+ _, err = rlim.CopyOut(t, addr)
+ return 0, nil, err
}
// Setrlimit implements linux syscall setrlimit(2).
@@ -155,7 +154,7 @@ func Setrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if err != nil {
return 0, nil, err
}
- if err := rlim.copyIn(t, addr); err != nil {
+ if _, err := rlim.CopyIn(t, addr); err != nil {
return 0, nil, syserror.EFAULT
}
_, err = prlimit64(t, resource, rlim.toLimit())
diff --git a/pkg/sentry/syscalls/linux/sys_rusage.go b/pkg/sentry/syscalls/linux/sys_rusage.go
index 1674c7445..ac5c98a54 100644
--- a/pkg/sentry/syscalls/linux/sys_rusage.go
+++ b/pkg/sentry/syscalls/linux/sys_rusage.go
@@ -80,7 +80,7 @@ func Getrusage(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
}
ru := getrusage(t, which)
- _, err := t.CopyOut(addr, &ru)
+ _, err := ru.CopyOut(t, addr)
return 0, nil, err
}
@@ -104,7 +104,7 @@ func Times(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
CUTime: linux.ClockTFromDuration(cs2.UserTime),
CSTime: linux.ClockTFromDuration(cs2.SysTime),
}
- if _, err := t.CopyOut(addr, &r); err != nil {
+ if _, err := r.CopyOut(t, addr); err != nil {
return 0, nil, err
}
diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go
index 99f6993f5..bfcf44b6f 100644
--- a/pkg/sentry/syscalls/linux/sys_sched.go
+++ b/pkg/sentry/syscalls/linux/sys_sched.go
@@ -27,8 +27,10 @@ const (
)
// SchedParam replicates struct sched_param in sched.h.
+//
+// +marshal
type SchedParam struct {
- schedPriority int64
+ schedPriority int32
}
// SchedGetparam implements linux syscall sched_getparam(2).
@@ -45,7 +47,7 @@ func SchedGetparam(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
return 0, nil, syserror.ESRCH
}
r := SchedParam{schedPriority: onlyPriority}
- if _, err := t.CopyOut(param, r); err != nil {
+ if _, err := r.CopyOut(t, param); err != nil {
return 0, nil, err
}
@@ -79,7 +81,7 @@ func SchedSetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ke
return 0, nil, syserror.ESRCH
}
var r SchedParam
- if _, err := t.CopyIn(param, &r); err != nil {
+ if _, err := r.CopyIn(t, param); err != nil {
return 0, nil, syserror.EINVAL
}
if r.schedPriority != onlyPriority {
diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go
index 5b7a66f4d..4fdb4463c 100644
--- a/pkg/sentry/syscalls/linux/sys_seccomp.go
+++ b/pkg/sentry/syscalls/linux/sys_seccomp.go
@@ -24,6 +24,8 @@ import (
)
// userSockFprog is equivalent to Linux's struct sock_fprog on amd64.
+//
+// +marshal
type userSockFprog struct {
// Len is the length of the filter in BPF instructions.
Len uint16
@@ -33,7 +35,7 @@ type userSockFprog struct {
// Filter is a user pointer to the struct sock_filter array that makes up
// the filter program. Filter is a uint64 rather than a usermem.Addr
// because usermem.Addr is actually uintptr, which is not a fixed-size
- // type, and encoding/binary.Read objects to this.
+ // type.
Filter uint64
}
@@ -54,11 +56,11 @@ func seccomp(t *kernel.Task, mode, flags uint64, addr usermem.Addr) error {
}
var fprog userSockFprog
- if _, err := t.CopyIn(addr, &fprog); err != nil {
+ if _, err := fprog.CopyIn(t, addr); err != nil {
return err
}
filter := make([]linux.BPFInstruction, int(fprog.Len))
- if _, err := t.CopyIn(usermem.Addr(fprog.Filter), &filter); err != nil {
+ if _, err := linux.CopyBPFInstructionSliceIn(t, usermem.Addr(fprog.Filter), filter); err != nil {
return err
}
compiledFilter, err := bpf.Compile(filter)
diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go
index 5f54f2456..47dadb800 100644
--- a/pkg/sentry/syscalls/linux/sys_sem.go
+++ b/pkg/sentry/syscalls/linux/sys_sem.go
@@ -18,6 +18,7 @@ import (
"math"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -66,7 +67,7 @@ func Semop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
ops := make([]linux.Sembuf, nsops)
- if _, err := t.CopyIn(sembufAddr, ops); err != nil {
+ if _, err := linux.CopySembufSliceIn(t, sembufAddr, ops); err != nil {
return 0, nil, err
}
@@ -116,8 +117,8 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
case linux.IPC_SET:
arg := args[3].Pointer()
- s := linux.SemidDS{}
- if _, err := t.CopyIn(arg, &s); err != nil {
+ var s linux.SemidDS
+ if _, err := s.CopyIn(t, arg); err != nil {
return 0, nil, err
}
@@ -188,7 +189,7 @@ func setValAll(t *kernel.Task, id int32, array usermem.Addr) error {
return syserror.EINVAL
}
vals := make([]uint16, set.Size())
- if _, err := t.CopyIn(array, vals); err != nil {
+ if _, err := primitive.CopyUint16SliceIn(t, array, vals); err != nil {
return err
}
creds := auth.CredentialsFromContext(t)
@@ -217,7 +218,7 @@ func getValAll(t *kernel.Task, id int32, array usermem.Addr) error {
if err != nil {
return err
}
- _, err = t.CopyOut(array, vals)
+ _, err = primitive.CopyUint16SliceOut(t, array, vals)
return err
}
diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go
index f0ae8fa8e..584064143 100644
--- a/pkg/sentry/syscalls/linux/sys_shm.go
+++ b/pkg/sentry/syscalls/linux/sys_shm.go
@@ -112,18 +112,18 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
stat, err := segment.IPCStat(t)
if err == nil {
- _, err = t.CopyOut(buf, stat)
+ _, err = stat.CopyOut(t, buf)
}
return 0, nil, err
case linux.IPC_INFO:
params := r.IPCInfo()
- _, err := t.CopyOut(buf, params)
+ _, err := params.CopyOut(t, buf)
return 0, nil, err
case linux.SHM_INFO:
info := r.ShmInfo()
- _, err := t.CopyOut(buf, info)
+ _, err := info.CopyOut(t, buf)
return 0, nil, err
}
@@ -137,11 +137,10 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
switch cmd {
case linux.IPC_SET:
var ds linux.ShmidDS
- _, err = t.CopyIn(buf, &ds)
- if err != nil {
+ if _, err = ds.CopyIn(t, buf); err != nil {
return 0, nil, err
}
- err = segment.Set(t, &ds)
+ err := segment.Set(t, &ds)
return 0, nil, err
case linux.IPC_RMID:
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index 38f573c14..9feaca0da 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -19,6 +19,8 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -29,8 +31,6 @@ import (
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
- "gvisor.dev/gvisor/tools/go_marshal/marshal"
- "gvisor.dev/gvisor/tools/go_marshal/primitive"
)
// LINT.IfChange
@@ -67,10 +67,10 @@ const flagsOffset = 48
const sizeOfInt32 = 4
// messageHeader64Len is the length of a MessageHeader64 struct.
-var messageHeader64Len = uint64(binary.Size(MessageHeader64{}))
+var messageHeader64Len = uint64((*MessageHeader64)(nil).SizeBytes())
// multipleMessageHeader64Len is the length of a multipeMessageHeader64 struct.
-var multipleMessageHeader64Len = uint64(binary.Size(multipleMessageHeader64{}))
+var multipleMessageHeader64Len = uint64((*multipleMessageHeader64)(nil).SizeBytes())
// baseRecvFlags are the flags that are accepted across recvmsg(2),
// recvmmsg(2), and recvfrom(2).
@@ -78,6 +78,8 @@ const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT |
// MessageHeader64 is the 64-bit representation of the msghdr struct used in
// the recvmsg and sendmsg syscalls.
+//
+// +marshal
type MessageHeader64 struct {
// Name is the optional pointer to a network address buffer.
Name uint64
@@ -106,30 +108,14 @@ type MessageHeader64 struct {
// multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in
// the recvmmsg and sendmmsg syscalls.
+//
+// +marshal
type multipleMessageHeader64 struct {
msgHdr MessageHeader64
msgLen uint32
_ int32
}
-// CopyInMessageHeader64 copies a message header from user to kernel memory.
-func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader64) error {
- b := t.CopyScratchBuffer(52)
- if _, err := t.CopyInBytes(addr, b); err != nil {
- return err
- }
-
- msg.Name = usermem.ByteOrder.Uint64(b[0:])
- msg.NameLen = usermem.ByteOrder.Uint32(b[8:])
- msg.Iov = usermem.ByteOrder.Uint64(b[16:])
- msg.IovLen = usermem.ByteOrder.Uint64(b[24:])
- msg.Control = usermem.ByteOrder.Uint64(b[32:])
- msg.ControlLen = usermem.ByteOrder.Uint64(b[40:])
- msg.Flags = int32(usermem.ByteOrder.Uint32(b[48:]))
-
- return nil
-}
-
// CaptureAddress allocates memory for and copies a socket address structure
// from the untrusted address space range.
func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) {
@@ -148,10 +134,10 @@ func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte,
// writeAddress writes a sockaddr structure and its length to an output buffer
// in the unstrusted address space range. If the address is bigger than the
// buffer, it is truncated.
-func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error {
+func writeAddress(t *kernel.Task, addr linux.SockAddr, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error {
// Get the buffer length.
var bufLen uint32
- if _, err := t.CopyIn(addrLenPtr, &bufLen); err != nil {
+ if _, err := primitive.CopyUint32In(t, addrLenPtr, &bufLen); err != nil {
return err
}
@@ -160,7 +146,7 @@ func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr user
}
// Write the length unconditionally.
- if _, err := t.CopyOut(addrLenPtr, addrLen); err != nil {
+ if _, err := primitive.CopyUint32Out(t, addrLenPtr, addrLen); err != nil {
return err
}
@@ -173,7 +159,8 @@ func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr user
}
// Copy as much of the address as will fit in the buffer.
- encodedAddr := binary.Marshal(nil, usermem.ByteOrder, addr)
+ encodedAddr := t.CopyScratchBuffer(addr.SizeBytes())
+ addr.MarshalUnsafe(encodedAddr)
if bufLen > uint32(len(encodedAddr)) {
bufLen = uint32(len(encodedAddr))
}
@@ -247,9 +234,9 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
}
// Copy the file descriptors out.
- if _, err := t.CopyOut(socks, fds); err != nil {
+ if _, err := primitive.CopyInt32SliceOut(t, socks, fds); err != nil {
for _, fd := range fds {
- if file, _ := t.FDTable().Remove(fd); file != nil {
+ if file, _ := t.FDTable().Remove(t, fd); file != nil {
file.DecRef(t)
}
}
@@ -456,8 +443,8 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
}
// Read the length. Reject negative values.
- optLen := int32(0)
- if _, err := t.CopyIn(optLenAddr, &optLen); err != nil {
+ var optLen int32
+ if _, err := primitive.CopyInt32In(t, optLenAddr, &optLen); err != nil {
return 0, nil, err
}
if optLen < 0 {
@@ -471,7 +458,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
}
vLen := int32(binary.Size(v))
- if _, err := t.CopyOut(optLenAddr, vLen); err != nil {
+ if _, err := primitive.CopyInt32Out(t, optLenAddr, vLen); err != nil {
return 0, nil, err
}
@@ -733,7 +720,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if !ok {
return 0, nil, syserror.EFAULT
}
- if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+ if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil {
break
}
count++
@@ -748,7 +735,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) {
// Capture the message header and io vectors.
var msg MessageHeader64
- if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+ if _, err := msg.CopyIn(t, msgPtr); err != nil {
return 0, err
}
@@ -780,7 +767,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i
if int(msg.Flags) != mflags {
// Copy out the flags to the caller.
- if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+ if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil {
return 0, err
}
}
@@ -817,17 +804,17 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i
}
// Copy the control data to the caller.
- if _, err := t.CopyOut(msgPtr+controlLenOffset, uint64(len(controlData))); err != nil {
+ if _, err := primitive.CopyUint64Out(t, msgPtr+controlLenOffset, uint64(len(controlData))); err != nil {
return 0, err
}
if len(controlData) > 0 {
- if _, err := t.CopyOut(usermem.Addr(msg.Control), controlData); err != nil {
+ if _, err := t.CopyOutBytes(usermem.Addr(msg.Control), controlData); err != nil {
return 0, err
}
}
// Copy out the flags to the caller.
- if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+ if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil {
return 0, err
}
@@ -996,7 +983,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if !ok {
return 0, nil, syserror.EFAULT
}
- if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+ if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil {
break
}
count++
@@ -1011,7 +998,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr usermem.Addr, flags int32) (uintptr, error) {
// Capture the message header.
var msg MessageHeader64
- if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+ if _, err := msg.CopyIn(t, msgPtr); err != nil {
return 0, err
}
@@ -1022,7 +1009,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme
return 0, syserror.ENOBUFS
}
controlData = make([]byte, msg.ControlLen)
- if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil {
+ if _, err := t.CopyInBytes(usermem.Addr(msg.Control), controlData); err != nil {
return 0, err
}
}
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index c69941feb..46616c961 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -16,6 +16,7 @@ package linux
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -141,7 +142,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
// Copy in the offset.
var offset int64
- if _, err := t.CopyIn(offsetAddr, &offset); err != nil {
+ if _, err := primitive.CopyInt64In(t, offsetAddr, &offset); err != nil {
return 0, nil, err
}
@@ -149,11 +150,11 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
n, err = doSplice(t, outFile, inFile, fs.SpliceOpts{
Length: count,
SrcOffset: true,
- SrcStart: offset,
+ SrcStart: int64(offset),
}, outFile.Flags().NonBlocking)
// Copy out the new offset.
- if _, err := t.CopyOut(offsetAddr, n+offset); err != nil {
+ if _, err := primitive.CopyInt64Out(t, offsetAddr, offset+n); err != nil {
return 0, nil, err
}
} else {
@@ -228,7 +229,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
}
var offset int64
- if _, err := t.CopyIn(outOffset, &offset); err != nil {
+ if _, err := primitive.CopyInt64In(t, outOffset, &offset); err != nil {
return 0, nil, err
}
@@ -246,7 +247,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
}
var offset int64
- if _, err := t.CopyIn(inOffset, &offset); err != nil {
+ if _, err := primitive.CopyInt64In(t, inOffset, &offset); err != nil {
return 0, nil, err
}
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
index a5826f2dd..cda29a8b5 100644
--- a/pkg/sentry/syscalls/linux/sys_stat.go
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -221,7 +221,7 @@ func statx(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr, statxAddr
DevMajor: uint32(devMajor),
DevMinor: devMinor,
}
- _, err := t.CopyOut(statxAddr, &s)
+ _, err := s.CopyOut(t, statxAddr)
return err
}
@@ -283,7 +283,7 @@ func statfsImpl(t *kernel.Task, d *fs.Dirent, addr usermem.Addr) error {
FragmentSize: d.Inode.StableAttr.BlockSize,
// Leave other fields 0 like simple_statfs does.
}
- _, err = t.CopyOut(addr, &statfs)
+ _, err = statfs.CopyOut(t, addr)
return err
}
diff --git a/pkg/sentry/syscalls/linux/sys_sysinfo.go b/pkg/sentry/syscalls/linux/sys_sysinfo.go
index 297de052a..674d341b6 100644
--- a/pkg/sentry/syscalls/linux/sys_sysinfo.go
+++ b/pkg/sentry/syscalls/linux/sys_sysinfo.go
@@ -43,6 +43,6 @@ func Sysinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
FreeRAM: memFree,
Unit: 1,
}
- _, err := t.CopyOut(addr, si)
+ _, err := si.CopyOut(t, addr)
return 0, nil, err
}
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index 101096038..39ca9ea97 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -19,6 +19,7 @@ import (
"syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fsbridge"
@@ -311,13 +312,13 @@ func wait4(t *kernel.Task, pid int, statusAddr usermem.Addr, options int, rusage
return 0, err
}
if statusAddr != 0 {
- if _, err := t.CopyOut(statusAddr, wr.Status); err != nil {
+ if _, err := primitive.CopyUint32Out(t, statusAddr, wr.Status); err != nil {
return 0, err
}
}
if rusageAddr != 0 {
ru := getrusage(wr.Task, linux.RUSAGE_BOTH)
- if _, err := t.CopyOut(rusageAddr, &ru); err != nil {
+ if _, err := ru.CopyOut(t, rusageAddr); err != nil {
return 0, err
}
}
@@ -395,14 +396,14 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// as well.
if infop != 0 {
var si arch.SignalInfo
- _, err = t.CopyOut(infop, &si)
+ _, err = si.CopyOut(t, infop)
}
}
return 0, nil, err
}
if rusageAddr != 0 {
ru := getrusage(wr.Task, linux.RUSAGE_BOTH)
- if _, err := t.CopyOut(rusageAddr, &ru); err != nil {
+ if _, err := ru.CopyOut(t, rusageAddr); err != nil {
return 0, nil, err
}
}
@@ -441,7 +442,7 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
default:
t.Warningf("waitid got incomprehensible wait status %d", s)
}
- _, err = t.CopyOut(infop, &si)
+ _, err = si.CopyOut(t, infop)
return 0, nil, err
}
@@ -558,9 +559,7 @@ func Getcpu(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// third argument to this system call is nowadays unused.
if cpu != 0 {
- buf := t.CopyScratchBuffer(4)
- usermem.ByteOrder.PutUint32(buf, uint32(t.CPU()))
- if _, err := t.CopyOutBytes(cpu, buf); err != nil {
+ if _, err := primitive.CopyInt32Out(t, cpu, t.CPU()); err != nil {
return 0, nil, err
}
}
diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go
index a2a24a027..c5054d2f1 100644
--- a/pkg/sentry/syscalls/linux/sys_time.go
+++ b/pkg/sentry/syscalls/linux/sys_time.go
@@ -19,6 +19,7 @@ import (
"time"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -168,7 +169,7 @@ func Time(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
return uintptr(r), nil, nil
}
- if _, err := t.CopyOut(addr, r); err != nil {
+ if _, err := r.CopyOut(t, addr); err != nil {
return 0, nil, err
}
return uintptr(r), nil, nil
@@ -334,8 +335,8 @@ func Gettimeofday(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
// Ask the time package for the timezone.
_, offset := time.Now().Zone()
// This int32 array mimics linux's struct timezone.
- timezone := [2]int32{-int32(offset) / 60, 0}
- _, err := t.CopyOut(tz, timezone)
+ timezone := []int32{-int32(offset) / 60, 0}
+ _, err := primitive.CopyInt32SliceOut(t, tz, timezone)
return 0, nil, err
}
return 0, nil, nil
diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go
index a4c400f87..45eef4feb 100644
--- a/pkg/sentry/syscalls/linux/sys_timer.go
+++ b/pkg/sentry/syscalls/linux/sys_timer.go
@@ -21,81 +21,63 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
)
const nsecPerSec = int64(time.Second)
-// copyItimerValIn copies an ItimerVal from the untrusted app range to the
-// kernel. The ItimerVal may be either 32 or 64 bits.
-// A NULL address is allowed because because Linux allows
-// setitimer(which, NULL, &old_value) which disables the timer.
-// There is a KERN_WARN message saying this misfeature will be removed.
-// However, that hasn't happened as of 3.19, so we continue to support it.
-func copyItimerValIn(t *kernel.Task, addr usermem.Addr) (linux.ItimerVal, error) {
- if addr == usermem.Addr(0) {
- return linux.ItimerVal{}, nil
- }
-
- switch t.Arch().Width() {
- case 8:
- // Native size, just copy directly.
- var itv linux.ItimerVal
- if _, err := t.CopyIn(addr, &itv); err != nil {
- return linux.ItimerVal{}, err
- }
-
- return itv, nil
- default:
- return linux.ItimerVal{}, syserror.ENOSYS
- }
-}
-
-// copyItimerValOut copies an ItimerVal to the untrusted app range.
-// The ItimerVal may be either 32 or 64 bits.
-// A NULL address is allowed, in which case no copy takes place
-func copyItimerValOut(t *kernel.Task, addr usermem.Addr, itv *linux.ItimerVal) error {
- if addr == usermem.Addr(0) {
- return nil
- }
-
- switch t.Arch().Width() {
- case 8:
- // Native size, just copy directly.
- _, err := t.CopyOut(addr, itv)
- return err
- default:
- return syserror.ENOSYS
- }
-}
-
// Getitimer implements linux syscall getitimer(2).
func Getitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ if t.Arch().Width() != 8 {
+ // Definition of linux.ItimerVal assumes 64-bit architecture.
+ return 0, nil, syserror.ENOSYS
+ }
+
timerID := args[0].Int()
- val := args[1].Pointer()
+ addr := args[1].Pointer()
olditv, err := t.Getitimer(timerID)
if err != nil {
return 0, nil, err
}
- return 0, nil, copyItimerValOut(t, val, &olditv)
+ // A NULL address is allowed, in which case no copy out takes place.
+ if addr == 0 {
+ return 0, nil, nil
+ }
+ _, err = olditv.CopyOut(t, addr)
+ return 0, nil, err
}
// Setitimer implements linux syscall setitimer(2).
func Setitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- timerID := args[0].Int()
- newVal := args[1].Pointer()
- oldVal := args[2].Pointer()
+ if t.Arch().Width() != 8 {
+ // Definition of linux.ItimerVal assumes 64-bit architecture.
+ return 0, nil, syserror.ENOSYS
+ }
- newitv, err := copyItimerValIn(t, newVal)
- if err != nil {
- return 0, nil, err
+ timerID := args[0].Int()
+ newAddr := args[1].Pointer()
+ oldAddr := args[2].Pointer()
+
+ var newitv linux.ItimerVal
+ // A NULL address is allowed because because Linux allows
+ // setitimer(which, NULL, &old_value) which disables the timer. There is a
+ // KERN_WARN message saying this misfeature will be removed. However, that
+ // hasn't happened as of 3.19, so we continue to support it.
+ if newAddr != 0 {
+ if _, err := newitv.CopyIn(t, newAddr); err != nil {
+ return 0, nil, err
+ }
}
olditv, err := t.Setitimer(timerID, newitv)
if err != nil {
return 0, nil, err
}
- return 0, nil, copyItimerValOut(t, oldVal, &olditv)
+ // A NULL address is allowed, in which case no copy out takes place.
+ if oldAddr == 0 {
+ return 0, nil, nil
+ }
+ _, err = olditv.CopyOut(t, oldAddr)
+ return 0, nil, err
}
// Alarm implements linux syscall alarm(2).
@@ -131,7 +113,7 @@ func TimerCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
var sev *linux.Sigevent
if sevp != 0 {
sev = &linux.Sigevent{}
- if _, err = t.CopyIn(sevp, sev); err != nil {
+ if _, err = sev.CopyIn(t, sevp); err != nil {
return 0, nil, err
}
}
@@ -141,7 +123,7 @@ func TimerCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
return 0, nil, err
}
- if _, err := t.CopyOut(timerIDp, &id); err != nil {
+ if _, err := id.CopyOut(t, timerIDp); err != nil {
t.IntervalTimerDelete(id)
return 0, nil, err
}
@@ -157,7 +139,7 @@ func TimerSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
oldValAddr := args[3].Pointer()
var newVal linux.Itimerspec
- if _, err := t.CopyIn(newValAddr, &newVal); err != nil {
+ if _, err := newVal.CopyIn(t, newValAddr); err != nil {
return 0, nil, err
}
oldVal, err := t.IntervalTimerSettime(timerID, newVal, flags&linux.TIMER_ABSTIME != 0)
@@ -165,9 +147,8 @@ func TimerSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
return 0, nil, err
}
if oldValAddr != 0 {
- if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil {
- return 0, nil, err
- }
+ _, err = oldVal.CopyOut(t, oldValAddr)
+ return 0, nil, err
}
return 0, nil, nil
}
@@ -181,7 +162,7 @@ func TimerGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
if err != nil {
return 0, nil, err
}
- _, err = t.CopyOut(curValAddr, &curVal)
+ _, err = curVal.CopyOut(t, curValAddr)
return 0, nil, err
}
diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go
index 34b03e4ee..cadd9d348 100644
--- a/pkg/sentry/syscalls/linux/sys_timerfd.go
+++ b/pkg/sentry/syscalls/linux/sys_timerfd.go
@@ -81,7 +81,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
}
var newVal linux.Itimerspec
- if _, err := t.CopyIn(newValAddr, &newVal); err != nil {
+ if _, err := newVal.CopyIn(t, newValAddr); err != nil {
return 0, nil, err
}
newS, err := ktime.SettingFromItimerspec(newVal, flags&linux.TFD_TIMER_ABSTIME != 0, tf.Clock())
@@ -91,7 +91,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
tm, oldS := tf.SetTime(newS)
if oldValAddr != 0 {
oldVal := ktime.ItimerspecFromSetting(tm, oldS)
- if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil {
+ if _, err := oldVal.CopyOut(t, oldValAddr); err != nil {
return 0, nil, err
}
}
@@ -116,6 +116,6 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
tm, s := tf.GetTime()
curVal := ktime.ItimerspecFromSetting(tm, s)
- _, err := t.CopyOut(curValAddr, &curVal)
+ _, err := curVal.CopyOut(t, curValAddr)
return 0, nil, err
}
diff --git a/pkg/sentry/syscalls/linux/sys_tls_amd64.go b/pkg/sentry/syscalls/linux/sys_tls_amd64.go
index b3eb96a1c..6ddd30d5c 100644
--- a/pkg/sentry/syscalls/linux/sys_tls_amd64.go
+++ b/pkg/sentry/syscalls/linux/sys_tls_amd64.go
@@ -18,6 +18,7 @@ package linux
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/syserror"
@@ -30,17 +31,19 @@ func ArchPrctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
case linux.ARCH_GET_FS:
addr := args[1].Pointer()
fsbase := t.Arch().TLS()
- _, err := t.CopyOut(addr, uint64(fsbase))
- if err != nil {
- return 0, nil, err
+ switch t.Arch().Width() {
+ case 8:
+ if _, err := primitive.CopyUint64Out(t, addr, uint64(fsbase)); err != nil {
+ return 0, nil, err
+ }
+ default:
+ return 0, nil, syserror.ENOSYS
}
-
case linux.ARCH_SET_FS:
fsbase := args[1].Uint64()
if !t.Arch().SetTLS(uintptr(fsbase)) {
return 0, nil, syserror.EPERM
}
-
case linux.ARCH_GET_GS, linux.ARCH_SET_GS:
t.Kernel().EmitUnimplementedEvent(t)
fallthrough
diff --git a/pkg/sentry/syscalls/linux/sys_utsname.go b/pkg/sentry/syscalls/linux/sys_utsname.go
index e9d702e8e..66c5974f5 100644
--- a/pkg/sentry/syscalls/linux/sys_utsname.go
+++ b/pkg/sentry/syscalls/linux/sys_utsname.go
@@ -46,7 +46,7 @@ func Uname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Copy out the result.
va := args[0].Pointer()
- _, err := t.CopyOut(va, u)
+ _, err := u.CopyOut(t, va)
return 0, nil, err
}
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
index 64696b438..9ee766552 100644
--- a/pkg/sentry/syscalls/linux/vfs2/BUILD
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -44,6 +44,9 @@ go_library(
"//pkg/context",
"//pkg/fspath",
"//pkg/gohacks",
+ "//pkg/log",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
"//pkg/sentry/arch",
"//pkg/sentry/fs/lock",
"//pkg/sentry/fsbridge",
@@ -72,7 +75,5 @@ go_library(
"//pkg/syserror",
"//pkg/usermem",
"//pkg/waiter",
- "//tools/go_marshal/marshal",
- "//tools/go_marshal/primitive",
],
)
diff --git a/pkg/sentry/syscalls/linux/vfs2/aio.go b/pkg/sentry/syscalls/linux/vfs2/aio.go
index 42559bf69..6d0a38330 100644
--- a/pkg/sentry/syscalls/linux/vfs2/aio.go
+++ b/pkg/sentry/syscalls/linux/vfs2/aio.go
@@ -17,6 +17,7 @@ package vfs2
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -38,21 +39,27 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
}
for i := int32(0); i < nrEvents; i++ {
- // Copy in the address.
- cbAddrNative := t.Arch().Native(0)
- if _, err := t.CopyIn(addr, cbAddrNative); err != nil {
- if i > 0 {
- // Some successful.
- return uintptr(i), nil, nil
+ // Copy in the callback address.
+ var cbAddr usermem.Addr
+ switch t.Arch().Width() {
+ case 8:
+ var cbAddrP primitive.Uint64
+ if _, err := cbAddrP.CopyIn(t, addr); err != nil {
+ if i > 0 {
+ // Some successful.
+ return uintptr(i), nil, nil
+ }
+ // Nothing done.
+ return 0, nil, err
}
- // Nothing done.
- return 0, nil, err
+ cbAddr = usermem.Addr(cbAddrP)
+ default:
+ return 0, nil, syserror.ENOSYS
}
// Copy in this callback.
var cb linux.IOCallback
- cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative))
- if _, err := t.CopyIn(cbAddr, &cb); err != nil {
+ if _, err := cb.CopyIn(t, cbAddr); err != nil {
if i > 0 {
// Some have been successful.
return uintptr(i), nil, nil
diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go
index c62f03509..d0cbb77eb 100644
--- a/pkg/sentry/syscalls/linux/vfs2/epoll.go
+++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go
@@ -24,7 +24,6 @@ import (
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -141,50 +140,26 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, syserror.EINVAL
}
- // Use a fixed-size buffer in a loop, instead of make([]linux.EpollEvent,
- // maxEvents), so that the buffer can be allocated on the stack.
+ // Allocate space for a few events on the stack for the common case in
+ // which we don't have too many events.
var (
- events [16]linux.EpollEvent
- total int
+ eventsArr [16]linux.EpollEvent
ch chan struct{}
haveDeadline bool
deadline ktime.Time
)
for {
- batchEvents := len(events)
- if batchEvents > maxEvents {
- batchEvents = maxEvents
- }
- n := ep.ReadEvents(events[:batchEvents])
- maxEvents -= n
- if n != 0 {
- // Copy what we read out.
- copiedBytes, err := linux.CopyEpollEventSliceOut(t, eventsAddr, events[:n])
+ events := ep.ReadEvents(eventsArr[:0], maxEvents)
+ if len(events) != 0 {
+ copiedBytes, err := linux.CopyEpollEventSliceOut(t, eventsAddr, events)
copiedEvents := copiedBytes / sizeofEpollEvent // rounded down
- eventsAddr += usermem.Addr(copiedEvents * sizeofEpollEvent)
- total += copiedEvents
- if err != nil {
- if total != 0 {
- return uintptr(total), nil, nil
- }
- return 0, nil, err
- }
- // If we've filled the application's event buffer, we're done.
- if maxEvents == 0 {
- return uintptr(total), nil, nil
- }
- // Loop if we read a full batch, under the expectation that there
- // may be more events to read.
- if n == batchEvents {
- continue
+ if copiedEvents != 0 {
+ return uintptr(copiedEvents), nil, nil
}
+ return 0, nil, err
}
- // We get here if n != batchEvents. If we read any number of events
- // (just now, or in a previous iteration of this loop), or if timeout
- // is 0 (such that epoll_wait should be non-blocking), return the
- // events we've read so far to the application.
- if total != 0 || timeout == 0 {
- return uintptr(total), nil, nil
+ if timeout == 0 {
+ return 0, nil, nil
}
// In the first iteration of this loop, register with the epoll
// instance for readability events, but then immediately continue the
@@ -207,8 +182,6 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if err == syserror.ETIMEDOUT {
err = nil
}
- // total must be 0 since otherwise we would have returned
- // above.
return 0, nil, err
}
}
diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go
index 4856554fe..d8b8d9783 100644
--- a/pkg/sentry/syscalls/linux/vfs2/fd.go
+++ b/pkg/sentry/syscalls/linux/vfs2/fd.go
@@ -34,7 +34,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Note that Remove provides a reference on the file that we may use to
// flush. It is still active until we drop the final reference below
// (and other reference-holding operations complete).
- _, file := t.FDTable().Remove(fd)
+ _, file := t.FDTable().Remove(t, fd)
if file == nil {
return 0, nil, syserror.EBADF
}
@@ -137,7 +137,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return uintptr(flags.ToLinuxFDFlags()), nil, nil
case linux.F_SETFD:
flags := args[2].Uint()
- err := t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{
+ err := t.FDTable().SetFlagsVFS2(t, fd, kernel.FDFlags{
CloseOnExec: flags&linux.FD_CLOEXEC != 0,
})
return 0, nil, err
@@ -181,11 +181,11 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if !hasOwner {
return 0, nil, nil
}
- _, err := t.CopyOut(args[2].Pointer(), &owner)
+ _, err := owner.CopyOut(t, args[2].Pointer())
return 0, nil, err
case linux.F_SETOWN_EX:
var owner linux.FOwnerEx
- _, err := t.CopyIn(args[2].Pointer(), &owner)
+ _, err := owner.CopyIn(t, args[2].Pointer())
if err != nil {
return 0, nil, err
}
@@ -286,7 +286,7 @@ func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescrip
// Copy in the lock request.
flockAddr := args[2].Pointer()
var flock linux.Flock
- if _, err := t.CopyIn(flockAddr, &flock); err != nil {
+ if _, err := flock.CopyIn(t, flockAddr); err != nil {
return err
}
diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
index 38778a388..2806c3f6f 100644
--- a/pkg/sentry/syscalls/linux/vfs2/ioctl.go
+++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
@@ -16,6 +16,7 @@ package vfs2
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/syserror"
@@ -34,20 +35,20 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Handle ioctls that apply to all FDs.
switch args[1].Int() {
case linux.FIONCLEX:
- t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{
+ t.FDTable().SetFlagsVFS2(t, fd, kernel.FDFlags{
CloseOnExec: false,
})
return 0, nil, nil
case linux.FIOCLEX:
- t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{
+ t.FDTable().SetFlagsVFS2(t, fd, kernel.FDFlags{
CloseOnExec: true,
})
return 0, nil, nil
case linux.FIONBIO:
var set int32
- if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+ if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
return 0, nil, err
}
flags := file.StatusFlags()
@@ -60,7 +61,7 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
case linux.FIOASYNC:
var set int32
- if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+ if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
return 0, nil, err
}
flags := file.StatusFlags()
@@ -82,12 +83,12 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
who = owner.PID
}
}
- _, err := t.CopyOut(args[2].Pointer(), &who)
+ _, err := primitive.CopyInt32Out(t, args[2].Pointer(), who)
return 0, nil, err
case linux.FIOSETOWN, linux.SIOCSPGRP:
var who int32
- if _, err := t.CopyIn(args[2].Pointer(), &who); err != nil {
+ if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &who); err != nil {
return 0, nil, err
}
ownerType := int32(linux.F_OWNER_PID)
diff --git a/pkg/sentry/syscalls/linux/vfs2/mmap.go b/pkg/sentry/syscalls/linux/vfs2/mmap.go
index dc05c2994..9d9dbf775 100644
--- a/pkg/sentry/syscalls/linux/vfs2/mmap.go
+++ b/pkg/sentry/syscalls/linux/vfs2/mmap.go
@@ -17,6 +17,7 @@ package vfs2
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/syserror"
@@ -85,6 +86,17 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if err := file.ConfigureMMap(t, &opts); err != nil {
return 0, nil, err
}
+ } else if shared {
+ // Back shared anonymous mappings with an anonymous tmpfs file.
+ opts.Offset = 0
+ file, err := tmpfs.NewZeroFile(t, t.Credentials(), t.Kernel().ShmMount(), opts.Length)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer file.DecRef(t)
+ if err := file.ConfigureMMap(t, &opts); err != nil {
+ return 0, nil, err
+ }
}
rv, err := t.MemoryManager().MMap(t, opts)
diff --git a/pkg/sentry/syscalls/linux/vfs2/pipe.go b/pkg/sentry/syscalls/linux/vfs2/pipe.go
index 9b4848d9e..ee38fdca0 100644
--- a/pkg/sentry/syscalls/linux/vfs2/pipe.go
+++ b/pkg/sentry/syscalls/linux/vfs2/pipe.go
@@ -16,6 +16,7 @@ package vfs2
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -51,9 +52,9 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags int32) error {
if err != nil {
return err
}
- if _, err := t.CopyOut(addr, fds); err != nil {
+ if _, err := primitive.CopyInt32SliceOut(t, addr, fds); err != nil {
for _, fd := range fds {
- if _, file := t.FDTable().Remove(fd); file != nil {
+ if _, file := t.FDTable().Remove(t, fd); file != nil {
file.DecRef(t)
}
}
diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go
index 79ad64039..c22e4ce54 100644
--- a/pkg/sentry/syscalls/linux/vfs2/poll.go
+++ b/pkg/sentry/syscalls/linux/vfs2/poll.go
@@ -165,7 +165,7 @@ func copyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD
pfd := make([]linux.PollFD, nfds)
if nfds > 0 {
- if _, err := t.CopyIn(addr, &pfd); err != nil {
+ if _, err := linux.CopyPollFDSliceIn(t, addr, pfd); err != nil {
return nil, err
}
}
@@ -192,7 +192,7 @@ func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration)
// The poll entries are copied out regardless of whether
// any are set or not. This aligns with the Linux behavior.
if nfds > 0 && err == nil {
- if _, err := t.CopyOut(addr, pfd); err != nil {
+ if _, err := linux.CopyPollFDSliceOut(t, addr, pfd); err != nil {
return remainingTimeout, 0, err
}
}
@@ -205,7 +205,7 @@ func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialBy
set := make([]byte, nBytes)
if addr != 0 {
- if _, err := t.CopyIn(addr, &set); err != nil {
+ if _, err := t.CopyInBytes(addr, set); err != nil {
return nil, err
}
// If we only use part of the last byte, mask out the extraneous bits.
@@ -332,19 +332,19 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add
// Copy updated vectors back.
if readFDs != 0 {
- if _, err := t.CopyOut(readFDs, r); err != nil {
+ if _, err := t.CopyOutBytes(readFDs, r); err != nil {
return 0, err
}
}
if writeFDs != 0 {
- if _, err := t.CopyOut(writeFDs, w); err != nil {
+ if _, err := t.CopyOutBytes(writeFDs, w); err != nil {
return 0, err
}
}
if exceptFDs != 0 {
- if _, err := t.CopyOut(exceptFDs, e); err != nil {
+ if _, err := t.CopyOutBytes(exceptFDs, e); err != nil {
return 0, err
}
}
@@ -497,6 +497,12 @@ func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return n, nil, err
}
+// +marshal
+type sigSetWithSize struct {
+ sigsetAddr uint64
+ sizeofSigset uint64
+}
+
// Pselect implements linux syscall pselect(2).
func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
nfds := int(args[0].Int()) // select(2) uses an int.
@@ -538,12 +544,6 @@ func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
return n, nil, err
}
-// +marshal
-type sigSetWithSize struct {
- sigsetAddr uint64
- sizeofSigset uint64
-}
-
// copyTimespecInToDuration copies a Timespec from the untrusted app range,
// validates it and converts it to a Duration.
//
diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go
index 5e6eb13ba..1ee37e5a8 100644
--- a/pkg/sentry/syscalls/linux/vfs2/setstat.go
+++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go
@@ -346,7 +346,7 @@ func populateSetStatOptionsForUtimes(t *kernel.Task, timesAddr usermem.Addr, opt
return nil
}
var times [2]linux.Timeval
- if _, err := t.CopyIn(timesAddr, &times); err != nil {
+ if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil {
return err
}
if times[0].Usec < 0 || times[0].Usec > 999999 || times[1].Usec < 0 || times[1].Usec > 999999 {
@@ -410,7 +410,7 @@ func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, op
return nil
}
var times [2]linux.Timespec
- if _, err := t.CopyIn(timesAddr, &times); err != nil {
+ if _, err := linux.CopyTimespecSliceIn(t, timesAddr, times[:]); err != nil {
return err
}
if times[0].Nsec != linux.UTIME_OMIT {
diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go
index a5032657a..bfae6b7e9 100644
--- a/pkg/sentry/syscalls/linux/vfs2/socket.go
+++ b/pkg/sentry/syscalls/linux/vfs2/socket.go
@@ -19,6 +19,8 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -30,8 +32,6 @@ import (
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
- "gvisor.dev/gvisor/tools/go_marshal/marshal"
- "gvisor.dev/gvisor/tools/go_marshal/primitive"
)
// minListenBacklog is the minimum reasonable backlog for listening sockets.
@@ -66,10 +66,10 @@ const flagsOffset = 48
const sizeOfInt32 = 4
// messageHeader64Len is the length of a MessageHeader64 struct.
-var messageHeader64Len = uint64(binary.Size(MessageHeader64{}))
+var messageHeader64Len = uint64((*MessageHeader64)(nil).SizeBytes())
// multipleMessageHeader64Len is the length of a multipeMessageHeader64 struct.
-var multipleMessageHeader64Len = uint64(binary.Size(multipleMessageHeader64{}))
+var multipleMessageHeader64Len = uint64((*multipleMessageHeader64)(nil).SizeBytes())
// baseRecvFlags are the flags that are accepted across recvmsg(2),
// recvmmsg(2), and recvfrom(2).
@@ -77,6 +77,8 @@ const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT |
// MessageHeader64 is the 64-bit representation of the msghdr struct used in
// the recvmsg and sendmsg syscalls.
+//
+// +marshal
type MessageHeader64 struct {
// Name is the optional pointer to a network address buffer.
Name uint64
@@ -105,30 +107,14 @@ type MessageHeader64 struct {
// multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in
// the recvmmsg and sendmmsg syscalls.
+//
+// +marshal
type multipleMessageHeader64 struct {
msgHdr MessageHeader64
msgLen uint32
_ int32
}
-// CopyInMessageHeader64 copies a message header from user to kernel memory.
-func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader64) error {
- b := t.CopyScratchBuffer(52)
- if _, err := t.CopyInBytes(addr, b); err != nil {
- return err
- }
-
- msg.Name = usermem.ByteOrder.Uint64(b[0:])
- msg.NameLen = usermem.ByteOrder.Uint32(b[8:])
- msg.Iov = usermem.ByteOrder.Uint64(b[16:])
- msg.IovLen = usermem.ByteOrder.Uint64(b[24:])
- msg.Control = usermem.ByteOrder.Uint64(b[32:])
- msg.ControlLen = usermem.ByteOrder.Uint64(b[40:])
- msg.Flags = int32(usermem.ByteOrder.Uint32(b[48:]))
-
- return nil
-}
-
// CaptureAddress allocates memory for and copies a socket address structure
// from the untrusted address space range.
func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) {
@@ -147,10 +133,10 @@ func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte,
// writeAddress writes a sockaddr structure and its length to an output buffer
// in the unstrusted address space range. If the address is bigger than the
// buffer, it is truncated.
-func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error {
+func writeAddress(t *kernel.Task, addr linux.SockAddr, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error {
// Get the buffer length.
var bufLen uint32
- if _, err := t.CopyIn(addrLenPtr, &bufLen); err != nil {
+ if _, err := primitive.CopyUint32In(t, addrLenPtr, &bufLen); err != nil {
return err
}
@@ -159,7 +145,7 @@ func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr user
}
// Write the length unconditionally.
- if _, err := t.CopyOut(addrLenPtr, addrLen); err != nil {
+ if _, err := primitive.CopyUint32Out(t, addrLenPtr, addrLen); err != nil {
return err
}
@@ -172,7 +158,8 @@ func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr user
}
// Copy as much of the address as will fit in the buffer.
- encodedAddr := binary.Marshal(nil, usermem.ByteOrder, addr)
+ encodedAddr := t.CopyScratchBuffer(addr.SizeBytes())
+ addr.MarshalUnsafe(encodedAddr)
if bufLen > uint32(len(encodedAddr)) {
bufLen = uint32(len(encodedAddr))
}
@@ -250,9 +237,9 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
return 0, nil, err
}
- if _, err := t.CopyOut(addr, fds); err != nil {
+ if _, err := primitive.CopyInt32SliceOut(t, addr, fds); err != nil {
for _, fd := range fds {
- if _, file := t.FDTable().Remove(fd); file != nil {
+ if _, file := t.FDTable().Remove(t, fd); file != nil {
file.DecRef(t)
}
}
@@ -459,8 +446,8 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
}
// Read the length. Reject negative values.
- optLen := int32(0)
- if _, err := t.CopyIn(optLenAddr, &optLen); err != nil {
+ var optLen int32
+ if _, err := primitive.CopyInt32In(t, optLenAddr, &optLen); err != nil {
return 0, nil, err
}
if optLen < 0 {
@@ -474,7 +461,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
}
vLen := int32(binary.Size(v))
- if _, err := t.CopyOut(optLenAddr, vLen); err != nil {
+ if _, err := primitive.CopyInt32Out(t, optLenAddr, vLen); err != nil {
return 0, nil, err
}
@@ -736,7 +723,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if !ok {
return 0, nil, syserror.EFAULT
}
- if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+ if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil {
break
}
count++
@@ -751,7 +738,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) {
// Capture the message header and io vectors.
var msg MessageHeader64
- if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+ if _, err := msg.CopyIn(t, msgPtr); err != nil {
return 0, err
}
@@ -783,7 +770,7 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla
if int(msg.Flags) != mflags {
// Copy out the flags to the caller.
- if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+ if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil {
return 0, err
}
}
@@ -820,17 +807,17 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla
}
// Copy the control data to the caller.
- if _, err := t.CopyOut(msgPtr+controlLenOffset, uint64(len(controlData))); err != nil {
+ if _, err := primitive.CopyUint64Out(t, msgPtr+controlLenOffset, uint64(len(controlData))); err != nil {
return 0, err
}
if len(controlData) > 0 {
- if _, err := t.CopyOut(usermem.Addr(msg.Control), controlData); err != nil {
+ if _, err := t.CopyOutBytes(usermem.Addr(msg.Control), controlData); err != nil {
return 0, err
}
}
// Copy out the flags to the caller.
- if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+ if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil {
return 0, err
}
@@ -999,7 +986,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if !ok {
return 0, nil, syserror.EFAULT
}
- if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+ if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil {
break
}
count++
@@ -1014,7 +1001,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescription, msgPtr usermem.Addr, flags int32) (uintptr, error) {
// Capture the message header.
var msg MessageHeader64
- if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+ if _, err := msg.CopyIn(t, msgPtr); err != nil {
return 0, err
}
@@ -1025,7 +1012,7 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio
return 0, syserror.ENOBUFS
}
controlData = make([]byte, msg.ControlLen)
- if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil {
+ if _, err := t.CopyInBytes(usermem.Addr(msg.Control), controlData); err != nil {
return 0, err
}
}
diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go
index 68ce94778..bf5c1171f 100644
--- a/pkg/sentry/syscalls/linux/vfs2/splice.go
+++ b/pkg/sentry/syscalls/linux/vfs2/splice.go
@@ -18,9 +18,12 @@ import (
"io"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+ slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -88,7 +91,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if inFile.Options().DenyPRead {
return 0, nil, syserror.EINVAL
}
- if _, err := t.CopyIn(inOffsetPtr, &inOffset); err != nil {
+ if _, err := primitive.CopyInt64In(t, inOffsetPtr, &inOffset); err != nil {
return 0, nil, err
}
if inOffset < 0 {
@@ -103,7 +106,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if outFile.Options().DenyPWrite {
return 0, nil, syserror.EINVAL
}
- if _, err := t.CopyIn(outOffsetPtr, &outOffset); err != nil {
+ if _, err := primitive.CopyInt64In(t, outOffsetPtr, &outOffset); err != nil {
return 0, nil, err
}
if outOffset < 0 {
@@ -144,11 +147,6 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
panic("at least one end of splice must be a pipe")
}
- if n == 0 && err == io.EOF {
- // We reached the end of the file. Eat the error and exit the loop.
- err = nil
- break
- }
if n != 0 || err != syserror.ErrWouldBlock || nonBlock {
break
}
@@ -159,25 +157,26 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Copy updated offsets out.
if inOffsetPtr != 0 {
- if _, err := t.CopyOut(inOffsetPtr, &inOffset); err != nil {
+ if _, err := primitive.CopyInt64Out(t, inOffsetPtr, inOffset); err != nil {
return 0, nil, err
}
}
if outOffsetPtr != 0 {
- if _, err := t.CopyOut(outOffsetPtr, &outOffset); err != nil {
+ if _, err := primitive.CopyInt64Out(t, outOffsetPtr, outOffset); err != nil {
return 0, nil, err
}
}
- if n == 0 {
- return 0, nil, err
+ if n != 0 {
+ // On Linux, inotify behavior is not very consistent with splice(2). We try
+ // our best to emulate Linux for very basic calls to splice, where for some
+ // reason, events are generated for output files, but not input files.
+ outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
}
- // On Linux, inotify behavior is not very consistent with splice(2). We try
- // our best to emulate Linux for very basic calls to splice, where for some
- // reason, events are generated for output files, but not input files.
- outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
- return uintptr(n), nil, nil
+ // We can only pass a single file to handleIOError, so pick inFile arbitrarily.
+ // This is used only for debugging purposes.
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "splice", outFile)
}
// Tee implements Linux syscall tee(2).
@@ -249,11 +248,20 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
break
}
}
- if n == 0 {
- return 0, nil, err
+
+ if n != 0 {
+ outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
+
+ // If a partial write is completed, the error is dropped. Log it here.
+ if err != nil && err != io.EOF && err != syserror.ErrWouldBlock {
+ log.Debugf("tee completed a partial write with error: %v", err)
+ err = nil
+ }
}
- outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
- return uintptr(n), nil, nil
+
+ // We can only pass a single file to handleIOError, so pick inFile arbitrarily.
+ // This is used only for debugging purposes.
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "tee", inFile)
}
// Sendfile implements linux system call sendfile(2).
@@ -302,9 +310,12 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if inFile.Options().DenyPRead {
return 0, nil, syserror.ESPIPE
}
- if _, err := t.CopyIn(offsetAddr, &offset); err != nil {
+ var offsetP primitive.Int64
+ if _, err := offsetP.CopyIn(t, offsetAddr); err != nil {
return 0, nil, err
}
+ offset = int64(offsetP)
+
if offset < 0 {
return 0, nil, syserror.EINVAL
}
@@ -343,11 +354,6 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
for n < count {
var spliceN int64
spliceN, err = outPipeFD.SpliceFromNonPipe(t, inFile, offset, count)
- if spliceN == 0 && err == io.EOF {
- // We reached the end of the file. Eat the error and exit the loop.
- err = nil
- break
- }
if offset != -1 {
offset += spliceN
}
@@ -370,13 +376,6 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
} else {
readN, err = inFile.Read(t, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
}
- if readN == 0 && err != nil {
- if err == io.EOF {
- // We reached the end of the file. Eat the error before exiting the loop.
- err = nil
- }
- break
- }
n += readN
// Write all of the bytes that we read. This may need
@@ -390,16 +389,21 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
err = dw.waitForOut(t)
}
if err != nil {
- // We didn't complete the write. Only
- // report the bytes that were actually
- // written, and rewind the offset.
+ // We didn't complete the write. Only report the bytes that were actually
+ // written, and rewind offsets as needed.
notWritten := int64(len(wbuf))
n -= notWritten
- if offset != -1 {
- // TODO(gvisor.dev/issue/3779): The inFile offset will be incorrect if we
- // roll back, because it has already been advanced by the full amount.
- // Merely seeking on inFile does not work, because there may be concurrent
- // file operations.
+ if offset == -1 {
+ // We modified the offset of the input file itself during the read
+ // operation. Rewind it.
+ if _, seekErr := inFile.Seek(t, -notWritten, linux.SEEK_CUR); seekErr != nil {
+ // Log the error but don't return it, since the write has already
+ // completed successfully.
+ log.Warningf("failed to roll back input file offset: %v", seekErr)
+ }
+ } else {
+ // The sendfile call was provided an offset parameter that should be
+ // adjusted to reflect the number of bytes sent. Rewind it.
offset -= notWritten
}
break
@@ -416,18 +420,26 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if offsetAddr != 0 {
// Copy out the new offset.
- if _, err := t.CopyOut(offsetAddr, offset); err != nil {
+ offsetP := primitive.Uint64(offset)
+ if _, err := offsetP.CopyOut(t, offsetAddr); err != nil {
return 0, nil, err
}
}
- if n == 0 {
- return 0, nil, err
+ if n != 0 {
+ inFile.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent)
+ outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
+
+ if err != nil && err != io.EOF && err != syserror.ErrWouldBlock {
+ // If a partial write is completed, the error is dropped. Log it here.
+ log.Debugf("sendfile completed a partial write with error: %v", err)
+ err = nil
+ }
}
- inFile.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent)
- outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
- return uintptr(n), nil, nil
+ // We can only pass a single file to handleIOError, so pick inFile arbitrarily.
+ // This is used only for debugging purposes.
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "sendfile", inFile)
}
// dualWaiter is used to wait on one or both vfs.FileDescriptions. It is not
diff --git a/pkg/sentry/syscalls/linux/vfs2/timerfd.go b/pkg/sentry/syscalls/linux/vfs2/timerfd.go
index 7a26890ef..250870c03 100644
--- a/pkg/sentry/syscalls/linux/vfs2/timerfd.go
+++ b/pkg/sentry/syscalls/linux/vfs2/timerfd.go
@@ -87,7 +87,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
}
var newVal linux.Itimerspec
- if _, err := t.CopyIn(newValAddr, &newVal); err != nil {
+ if _, err := newVal.CopyIn(t, newValAddr); err != nil {
return 0, nil, err
}
newS, err := ktime.SettingFromItimerspec(newVal, flags&linux.TFD_TIMER_ABSTIME != 0, tfd.Clock())
@@ -97,7 +97,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
tm, oldS := tfd.SetTime(newS)
if oldValAddr != 0 {
oldVal := ktime.ItimerspecFromSetting(tm, oldS)
- if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil {
+ if _, err := oldVal.CopyOut(t, oldValAddr); err != nil {
return 0, nil, err
}
}
@@ -122,6 +122,6 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
tm, s := tfd.GetTime()
curVal := ktime.ItimerspecFromSetting(tm, s)
- _, err := t.CopyOut(curValAddr, &curVal)
+ _, err := curVal.CopyOut(t, curValAddr)
return 0, nil, err
}
diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
index c576d9475..0df3bd449 100644
--- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go
+++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
@@ -93,16 +93,16 @@ func Override() {
s.Table[165] = syscalls.Supported("mount", Mount)
s.Table[166] = syscalls.Supported("umount2", Umount2)
s.Table[187] = syscalls.Supported("readahead", Readahead)
- s.Table[188] = syscalls.Supported("setxattr", Setxattr)
+ s.Table[188] = syscalls.Supported("setxattr", SetXattr)
s.Table[189] = syscalls.Supported("lsetxattr", Lsetxattr)
s.Table[190] = syscalls.Supported("fsetxattr", Fsetxattr)
- s.Table[191] = syscalls.Supported("getxattr", Getxattr)
+ s.Table[191] = syscalls.Supported("getxattr", GetXattr)
s.Table[192] = syscalls.Supported("lgetxattr", Lgetxattr)
s.Table[193] = syscalls.Supported("fgetxattr", Fgetxattr)
- s.Table[194] = syscalls.Supported("listxattr", Listxattr)
+ s.Table[194] = syscalls.Supported("listxattr", ListXattr)
s.Table[195] = syscalls.Supported("llistxattr", Llistxattr)
s.Table[196] = syscalls.Supported("flistxattr", Flistxattr)
- s.Table[197] = syscalls.Supported("removexattr", Removexattr)
+ s.Table[197] = syscalls.Supported("removexattr", RemoveXattr)
s.Table[198] = syscalls.Supported("lremovexattr", Lremovexattr)
s.Table[199] = syscalls.Supported("fremovexattr", Fremovexattr)
s.Table[209] = syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"})
@@ -163,16 +163,16 @@ func Override() {
// Override ARM64.
s = linux.ARM64
- s.Table[5] = syscalls.Supported("setxattr", Setxattr)
+ s.Table[5] = syscalls.Supported("setxattr", SetXattr)
s.Table[6] = syscalls.Supported("lsetxattr", Lsetxattr)
s.Table[7] = syscalls.Supported("fsetxattr", Fsetxattr)
- s.Table[8] = syscalls.Supported("getxattr", Getxattr)
+ s.Table[8] = syscalls.Supported("getxattr", GetXattr)
s.Table[9] = syscalls.Supported("lgetxattr", Lgetxattr)
s.Table[10] = syscalls.Supported("fgetxattr", Fgetxattr)
- s.Table[11] = syscalls.Supported("listxattr", Listxattr)
+ s.Table[11] = syscalls.Supported("listxattr", ListXattr)
s.Table[12] = syscalls.Supported("llistxattr", Llistxattr)
s.Table[13] = syscalls.Supported("flistxattr", Flistxattr)
- s.Table[14] = syscalls.Supported("removexattr", Removexattr)
+ s.Table[14] = syscalls.Supported("removexattr", RemoveXattr)
s.Table[15] = syscalls.Supported("lremovexattr", Lremovexattr)
s.Table[16] = syscalls.Supported("fremovexattr", Fremovexattr)
s.Table[17] = syscalls.Supported("getcwd", Getcwd)
diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go
index ef99246ed..e05723ef9 100644
--- a/pkg/sentry/syscalls/linux/vfs2/xattr.go
+++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go
@@ -26,8 +26,8 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
-// Listxattr implements Linux syscall listxattr(2).
-func Listxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+// ListXattr implements Linux syscall listxattr(2).
+func ListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
return listxattr(t, args, followFinalSymlink)
}
@@ -51,7 +51,7 @@ func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSyml
}
defer tpop.Release(t)
- names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop, uint64(size))
+ names, err := t.Kernel().VFS().ListXattrAt(t, t.Credentials(), &tpop.pop, uint64(size))
if err != nil {
return 0, nil, err
}
@@ -74,7 +74,7 @@ func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
}
defer file.DecRef(t)
- names, err := file.Listxattr(t, uint64(size))
+ names, err := file.ListXattr(t, uint64(size))
if err != nil {
return 0, nil, err
}
@@ -85,8 +85,8 @@ func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
return uintptr(n), nil, nil
}
-// Getxattr implements Linux syscall getxattr(2).
-func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+// GetXattr implements Linux syscall getxattr(2).
+func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
return getxattr(t, args, followFinalSymlink)
}
@@ -116,7 +116,7 @@ func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli
return 0, nil, err
}
- value, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.GetxattrOptions{
+ value, err := t.Kernel().VFS().GetXattrAt(t, t.Credentials(), &tpop.pop, &vfs.GetXattrOptions{
Name: name,
Size: uint64(size),
})
@@ -148,7 +148,7 @@ func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, err
}
- value, err := file.Getxattr(t, &vfs.GetxattrOptions{Name: name, Size: uint64(size)})
+ value, err := file.GetXattr(t, &vfs.GetXattrOptions{Name: name, Size: uint64(size)})
if err != nil {
return 0, nil, err
}
@@ -159,8 +159,8 @@ func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return uintptr(n), nil, nil
}
-// Setxattr implements Linux syscall setxattr(2).
-func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+// SetXattr implements Linux syscall setxattr(2).
+func SetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
return 0, nil, setxattr(t, args, followFinalSymlink)
}
@@ -199,7 +199,7 @@ func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli
return err
}
- return t.Kernel().VFS().SetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.SetxattrOptions{
+ return t.Kernel().VFS().SetXattrAt(t, t.Credentials(), &tpop.pop, &vfs.SetXattrOptions{
Name: name,
Value: value,
Flags: uint32(flags),
@@ -233,15 +233,15 @@ func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, err
}
- return 0, nil, file.Setxattr(t, &vfs.SetxattrOptions{
+ return 0, nil, file.SetXattr(t, &vfs.SetXattrOptions{
Name: name,
Value: value,
Flags: uint32(flags),
})
}
-// Removexattr implements Linux syscall removexattr(2).
-func Removexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+// RemoveXattr implements Linux syscall removexattr(2).
+func RemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
return 0, nil, removexattr(t, args, followFinalSymlink)
}
@@ -269,7 +269,7 @@ func removexattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSy
return err
}
- return t.Kernel().VFS().RemovexattrAt(t, t.Credentials(), &tpop.pop, name)
+ return t.Kernel().VFS().RemoveXattrAt(t, t.Credentials(), &tpop.pop, name)
}
// Fremovexattr implements Linux syscall fremovexattr(2).
@@ -288,7 +288,7 @@ func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
return 0, nil, err
}
- return 0, nil, file.Removexattr(t, name)
+ return 0, nil, file.RemoveXattr(t, name)
}
func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
index 5a0e3e6b5..bdfd3ca8f 100644
--- a/pkg/sentry/vfs/anonfs.go
+++ b/pkg/sentry/vfs/anonfs.go
@@ -52,6 +52,8 @@ const (
)
// anonFilesystemType implements FilesystemType.
+//
+// +stateify savable
type anonFilesystemType struct{}
// GetFilesystem implements FilesystemType.GetFilesystem.
@@ -69,12 +71,15 @@ func (anonFilesystemType) Name() string {
//
// Since all Dentries in anonFilesystem are non-directories, all FilesystemImpl
// methods that would require an anonDentry to be a directory return ENOTDIR.
+//
+// +stateify savable
type anonFilesystem struct {
vfsfs Filesystem
devMinor uint32
}
+// +stateify savable
type anonDentry struct {
vfsd Dentry
@@ -245,32 +250,32 @@ func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath
return nil, syserror.ECONNREFUSED
}
-// ListxattrAt implements FilesystemImpl.ListxattrAt.
-func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements FilesystemImpl.ListXattrAt.
+func (fs *anonFilesystem) ListXattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) {
if !rp.Done() {
return nil, syserror.ENOTDIR
}
return nil, nil
}
-// GetxattrAt implements FilesystemImpl.GetxattrAt.
-func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error) {
+// GetXattrAt implements FilesystemImpl.GetXattrAt.
+func (fs *anonFilesystem) GetXattrAt(ctx context.Context, rp *ResolvingPath, opts GetXattrOptions) (string, error) {
if !rp.Done() {
return "", syserror.ENOTDIR
}
return "", syserror.ENOTSUP
}
-// SetxattrAt implements FilesystemImpl.SetxattrAt.
-func (fs *anonFilesystem) SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error {
+// SetXattrAt implements FilesystemImpl.SetXattrAt.
+func (fs *anonFilesystem) SetXattrAt(ctx context.Context, rp *ResolvingPath, opts SetXattrOptions) error {
if !rp.Done() {
return syserror.ENOTDIR
}
return syserror.EPERM
}
-// RemovexattrAt implements FilesystemImpl.RemovexattrAt.
-func (fs *anonFilesystem) RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error {
+// RemoveXattrAt implements FilesystemImpl.RemoveXattrAt.
+func (fs *anonFilesystem) RemoveXattrAt(ctx context.Context, rp *ResolvingPath, name string) error {
if !rp.Done() {
return syserror.ENOTDIR
}
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index a69a5b2f1..320ab7ce1 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -89,6 +89,8 @@ func (d *Dentry) Impl() DentryImpl {
// DentryImpl contains implementation details for a Dentry. Implementations of
// DentryImpl should contain their associated Dentry by value as their first
// field.
+//
+// +stateify savable
type DentryImpl interface {
// IncRef increments the Dentry's reference count. A Dentry with a non-zero
// reference count must remain coherent with the state of the filesystem.
diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go
index 1e9dffc8f..dde2ad79b 100644
--- a/pkg/sentry/vfs/device.go
+++ b/pkg/sentry/vfs/device.go
@@ -22,6 +22,8 @@ import (
)
// DeviceKind indicates whether a device is a block or character device.
+//
+// +stateify savable
type DeviceKind uint32
const (
@@ -44,6 +46,7 @@ func (kind DeviceKind) String() string {
}
}
+// +stateify savable
type devTuple struct {
kind DeviceKind
major uint32
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
index 1b5af9f73..8f36c3e3b 100644
--- a/pkg/sentry/vfs/epoll.go
+++ b/pkg/sentry/vfs/epoll.go
@@ -27,6 +27,8 @@ import (
var epollCycleMu sync.Mutex
// EpollInstance represents an epoll instance, as described by epoll(7).
+//
+// +stateify savable
type EpollInstance struct {
vfsfd FileDescription
FileDescriptionDefaultImpl
@@ -38,11 +40,11 @@ type EpollInstance struct {
// interest is the set of file descriptors that are registered with the
// EpollInstance for monitoring. interest is protected by interestMu.
- interestMu sync.Mutex
+ interestMu sync.Mutex `state:"nosave"`
interest map[epollInterestKey]*epollInterest
// mu protects fields in registered epollInterests.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
// ready is the set of file descriptors that may be "ready" for I/O. Note
// that this must be an ordered list, not a map: "If more than maxevents
@@ -55,6 +57,7 @@ type EpollInstance struct {
ready epollInterestList
}
+// +stateify savable
type epollInterestKey struct {
// file is the registered FileDescription. No reference is held on file;
// instead, when the last reference is dropped, FileDescription.DecRef()
@@ -67,6 +70,8 @@ type epollInterestKey struct {
}
// epollInterest represents an EpollInstance's interest in a file descriptor.
+//
+// +stateify savable
type epollInterest struct {
// epoll is the owning EpollInstance. epoll is immutable.
epoll *EpollInstance
@@ -331,11 +336,9 @@ func (ep *EpollInstance) removeLocked(epi *epollInterest) {
ep.mu.Unlock()
}
-// ReadEvents reads up to len(events) ready events into events and returns the
-// number of events read.
-//
-// Preconditions: len(events) != 0.
-func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int {
+// ReadEvents appends up to maxReady events to events and returns the updated
+// slice of events.
+func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent, maxEvents int) []linux.EpollEvent {
i := 0
// Hot path: avoid defer.
ep.mu.Lock()
@@ -368,16 +371,16 @@ func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int {
requeue.PushBack(epi)
}
// Report ievents.
- events[i] = linux.EpollEvent{
+ events = append(events, linux.EpollEvent{
Events: ievents.ToLinux(),
Data: epi.userData,
- }
+ })
i++
- if i == len(events) {
+ if i == maxEvents {
break
}
}
ep.ready.PushBackList(&requeue)
ep.mu.Unlock()
- return i
+ return events
}
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 22a54fa48..1eba0270f 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -37,11 +37,13 @@ import (
// FileDescription methods require that a reference is held.
//
// FileDescription is analogous to Linux's struct file.
+//
+// +stateify savable
type FileDescription struct {
FileDescriptionRefs
// flagsMu protects statusFlags and asyncHandler below.
- flagsMu sync.Mutex
+ flagsMu sync.Mutex `state:"nosave"`
// statusFlags contains status flags, "initialized by open(2) and possibly
// modified by fcntl()" - fcntl(2). statusFlags can be read using atomic
@@ -56,7 +58,7 @@ type FileDescription struct {
// epolls is the set of epollInterests registered for this FileDescription.
// epolls is protected by epollMu.
- epollMu sync.Mutex
+ epollMu sync.Mutex `state:"nosave"`
epolls map[*epollInterest]struct{}
// vd is the filesystem location at which this FileDescription was opened.
@@ -88,6 +90,8 @@ type FileDescription struct {
}
// FileDescriptionOptions contains options to FileDescription.Init().
+//
+// +stateify savable
type FileDescriptionOptions struct {
// If AllowDirectIO is true, allow O_DIRECT to be set on the file.
AllowDirectIO bool
@@ -101,7 +105,7 @@ type FileDescriptionOptions struct {
// If UseDentryMetadata is true, calls to FileDescription methods that
// interact with file and filesystem metadata (Stat, SetStat, StatFS,
- // Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling
+ // ListXattr, GetXattr, SetXattr, RemoveXattr) are implemented by calling
// the corresponding FilesystemImpl methods instead of the corresponding
// FileDescriptionImpl methods.
//
@@ -326,6 +330,9 @@ type FileDescriptionImpl interface {
// Allocate grows the file to offset + length bytes.
// Only mode == 0 is supported currently.
//
+ // Allocate should return EISDIR on directories, ESPIPE on pipes, and ENODEV on
+ // other files where it is not supported.
+ //
// Preconditions: The FileDescription was opened for writing.
Allocate(ctx context.Context, mode, offset, length uint64) error
@@ -420,19 +427,19 @@ type FileDescriptionImpl interface {
// Ioctl implements the ioctl(2) syscall.
Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error)
- // Listxattr returns all extended attribute names for the file.
- Listxattr(ctx context.Context, size uint64) ([]string, error)
+ // ListXattr returns all extended attribute names for the file.
+ ListXattr(ctx context.Context, size uint64) ([]string, error)
- // Getxattr returns the value associated with the given extended attribute
+ // GetXattr returns the value associated with the given extended attribute
// for the file.
- Getxattr(ctx context.Context, opts GetxattrOptions) (string, error)
+ GetXattr(ctx context.Context, opts GetXattrOptions) (string, error)
- // Setxattr changes the value associated with the given extended attribute
+ // SetXattr changes the value associated with the given extended attribute
// for the file.
- Setxattr(ctx context.Context, opts SetxattrOptions) error
+ SetXattr(ctx context.Context, opts SetXattrOptions) error
- // Removexattr removes the given extended attribute from the file.
- Removexattr(ctx context.Context, name string) error
+ // RemoveXattr removes the given extended attribute from the file.
+ RemoveXattr(ctx context.Context, name string) error
// LockBSD tries to acquire a BSD-style advisory file lock.
LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error
@@ -448,6 +455,8 @@ type FileDescriptionImpl interface {
}
// Dirent holds the information contained in struct linux_dirent64.
+//
+// +stateify savable
type Dirent struct {
// Name is the filename.
Name string
@@ -635,25 +644,25 @@ func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.
return fd.impl.Ioctl(ctx, uio, args)
}
-// Listxattr returns all extended attribute names for the file represented by
+// ListXattr returns all extended attribute names for the file represented by
// fd.
//
// If the size of the list (including a NUL terminating byte after every entry)
// would exceed size, ERANGE may be returned. Note that implementations
// are free to ignore size entirely and return without error). In all cases,
// if size is 0, the list should be returned without error, regardless of size.
-func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
+func (fd *FileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
if fd.opts.UseDentryMetadata {
vfsObj := fd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
Root: fd.vd,
Start: fd.vd,
})
- names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp, size)
+ names, err := fd.vd.mount.fs.impl.ListXattrAt(ctx, rp, size)
vfsObj.putResolvingPath(ctx, rp)
return names, err
}
- names, err := fd.impl.Listxattr(ctx, size)
+ names, err := fd.impl.ListXattr(ctx, size)
if err == syserror.ENOTSUP {
// Linux doesn't actually return ENOTSUP in this case; instead,
// fs/xattr.c:vfs_listxattr() falls back to allowing the security
@@ -664,57 +673,57 @@ func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string
return names, err
}
-// Getxattr returns the value associated with the given extended attribute for
+// GetXattr returns the value associated with the given extended attribute for
// the file represented by fd.
//
// If the size of the return value exceeds opts.Size, ERANGE may be returned
// (note that implementations are free to ignore opts.Size entirely and return
// without error). In all cases, if opts.Size is 0, the value should be
// returned without error, regardless of size.
-func (fd *FileDescription) Getxattr(ctx context.Context, opts *GetxattrOptions) (string, error) {
+func (fd *FileDescription) GetXattr(ctx context.Context, opts *GetXattrOptions) (string, error) {
if fd.opts.UseDentryMetadata {
vfsObj := fd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
Root: fd.vd,
Start: fd.vd,
})
- val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
+ val, err := fd.vd.mount.fs.impl.GetXattrAt(ctx, rp, *opts)
vfsObj.putResolvingPath(ctx, rp)
return val, err
}
- return fd.impl.Getxattr(ctx, *opts)
+ return fd.impl.GetXattr(ctx, *opts)
}
-// Setxattr changes the value associated with the given extended attribute for
+// SetXattr changes the value associated with the given extended attribute for
// the file represented by fd.
-func (fd *FileDescription) Setxattr(ctx context.Context, opts *SetxattrOptions) error {
+func (fd *FileDescription) SetXattr(ctx context.Context, opts *SetXattrOptions) error {
if fd.opts.UseDentryMetadata {
vfsObj := fd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
Root: fd.vd,
Start: fd.vd,
})
- err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
+ err := fd.vd.mount.fs.impl.SetXattrAt(ctx, rp, *opts)
vfsObj.putResolvingPath(ctx, rp)
return err
}
- return fd.impl.Setxattr(ctx, *opts)
+ return fd.impl.SetXattr(ctx, *opts)
}
-// Removexattr removes the given extended attribute from the file represented
+// RemoveXattr removes the given extended attribute from the file represented
// by fd.
-func (fd *FileDescription) Removexattr(ctx context.Context, name string) error {
+func (fd *FileDescription) RemoveXattr(ctx context.Context, name string) error {
if fd.opts.UseDentryMetadata {
vfsObj := fd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
Root: fd.vd,
Start: fd.vd,
})
- err := fd.vd.mount.fs.impl.RemovexattrAt(ctx, rp, name)
+ err := fd.vd.mount.fs.impl.RemoveXattrAt(ctx, rp, name)
vfsObj.putResolvingPath(ctx, rp)
return err
}
- return fd.impl.Removexattr(ctx, name)
+ return fd.impl.RemoveXattr(ctx, name)
}
// SyncFS instructs the filesystem containing fd to execute the semantics of
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index 6b8b4ad49..48ca9de44 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -42,6 +42,8 @@ import (
// FileDescriptionDefaultImpl may be embedded by implementations of
// FileDescriptionImpl to obtain implementations of many FileDescriptionImpl
// methods with default behavior analogous to Linux's.
+//
+// +stateify savable
type FileDescriptionDefaultImpl struct{}
// OnClose implements FileDescriptionImpl.OnClose analogously to
@@ -57,7 +59,11 @@ func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, err
}
// Allocate implements FileDescriptionImpl.Allocate analogously to
-// fallocate called on regular file, directory or FIFO in Linux.
+// fallocate called on an invalid type of file in Linux.
+//
+// Note that directories can rely on this implementation even though they
+// should technically return EISDIR. Allocate should never be called for a
+// directory, because it requires a writable fd.
func (FileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error {
return syserror.ENODEV
}
@@ -134,34 +140,36 @@ func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, arg
return 0, syserror.ENOTTY
}
-// Listxattr implements FileDescriptionImpl.Listxattr analogously to
+// ListXattr implements FileDescriptionImpl.ListXattr analogously to
// inode_operations::listxattr == NULL in Linux.
-func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context, size uint64) ([]string, error) {
- // This isn't exactly accurate; see FileDescription.Listxattr.
+func (FileDescriptionDefaultImpl) ListXattr(ctx context.Context, size uint64) ([]string, error) {
+ // This isn't exactly accurate; see FileDescription.ListXattr.
return nil, syserror.ENOTSUP
}
-// Getxattr implements FileDescriptionImpl.Getxattr analogously to
+// GetXattr implements FileDescriptionImpl.GetXattr analogously to
// inode::i_opflags & IOP_XATTR == 0 in Linux.
-func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, opts GetxattrOptions) (string, error) {
+func (FileDescriptionDefaultImpl) GetXattr(ctx context.Context, opts GetXattrOptions) (string, error) {
return "", syserror.ENOTSUP
}
-// Setxattr implements FileDescriptionImpl.Setxattr analogously to
+// SetXattr implements FileDescriptionImpl.SetXattr analogously to
// inode::i_opflags & IOP_XATTR == 0 in Linux.
-func (FileDescriptionDefaultImpl) Setxattr(ctx context.Context, opts SetxattrOptions) error {
+func (FileDescriptionDefaultImpl) SetXattr(ctx context.Context, opts SetXattrOptions) error {
return syserror.ENOTSUP
}
-// Removexattr implements FileDescriptionImpl.Removexattr analogously to
+// RemoveXattr implements FileDescriptionImpl.RemoveXattr analogously to
// inode::i_opflags & IOP_XATTR == 0 in Linux.
-func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) error {
+func (FileDescriptionDefaultImpl) RemoveXattr(ctx context.Context, name string) error {
return syserror.ENOTSUP
}
// DirectoryFileDescriptionDefaultImpl may be embedded by implementations of
// FileDescriptionImpl that always represent directories to obtain
// implementations of non-directory I/O methods that return EISDIR.
+//
+// +stateify savable
type DirectoryFileDescriptionDefaultImpl struct{}
// Allocate implements DirectoryFileDescriptionDefaultImpl.Allocate.
@@ -192,6 +200,8 @@ func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src userme
// DentryMetadataFileDescriptionImpl may be embedded by implementations of
// FileDescriptionImpl for which FileDescriptionOptions.UseDentryMetadata is
// true to obtain implementations of Stat and SetStat that panic.
+//
+// +stateify savable
type DentryMetadataFileDescriptionImpl struct{}
// Stat implements FileDescriptionImpl.Stat.
@@ -206,12 +216,16 @@ func (DentryMetadataFileDescriptionImpl) SetStat(ctx context.Context, opts SetSt
// DynamicBytesSource represents a data source for a
// DynamicBytesFileDescriptionImpl.
+//
+// +stateify savable
type DynamicBytesSource interface {
// Generate writes the file's contents to buf.
Generate(ctx context.Context, buf *bytes.Buffer) error
}
// StaticData implements DynamicBytesSource over a static string.
+//
+// +stateify savable
type StaticData struct {
Data string
}
@@ -238,14 +252,24 @@ type WritableDynamicBytesSource interface {
//
// DynamicBytesFileDescriptionImpl.SetDataSource() must be called before first
// use.
+//
+// +stateify savable
type DynamicBytesFileDescriptionImpl struct {
data DynamicBytesSource // immutable
- mu sync.Mutex // protects the following fields
- buf bytes.Buffer
+ mu sync.Mutex `state:"nosave"` // protects the following fields
+ buf bytes.Buffer `state:".([]byte)"`
off int64
lastRead int64 // offset at which the last Read, PRead, or Seek ended
}
+func (fd *DynamicBytesFileDescriptionImpl) saveBuf() []byte {
+ return fd.buf.Bytes()
+}
+
+func (fd *DynamicBytesFileDescriptionImpl) loadBuf(p []byte) {
+ fd.buf.Write(p)
+}
+
// SetDataSource must be called exactly once on fd before first use.
func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) {
fd.data = data
@@ -378,6 +402,8 @@ func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.M
// LockFD may be used by most implementations of FileDescriptionImpl.Lock*
// functions. Caller must call Init().
+//
+// +stateify savable
type LockFD struct {
locks *FileLocks
}
@@ -405,6 +431,8 @@ func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error {
// NoLockFD implements Lock*/Unlock* portion of FileDescriptionImpl interface
// returning ENOLCK.
+//
+// +stateify savable
type NoLockFD struct{}
// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 46851f638..c93d94634 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -416,26 +416,26 @@ type FilesystemImpl interface {
// ResolvingPath.Resolve*(), then !rp.Done().
UnlinkAt(ctx context.Context, rp *ResolvingPath) error
- // ListxattrAt returns all extended attribute names for the file at rp.
+ // ListXattrAt returns all extended attribute names for the file at rp.
//
// Errors:
//
// - If extended attributes are not supported by the filesystem,
- // ListxattrAt returns ENOTSUP.
+ // ListXattrAt returns ENOTSUP.
//
// - If the size of the list (including a NUL terminating byte after every
// entry) would exceed size, ERANGE may be returned. Note that
// implementations are free to ignore size entirely and return without
// error). In all cases, if size is 0, the list should be returned without
// error, regardless of size.
- ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error)
+ ListXattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error)
- // GetxattrAt returns the value associated with the given extended
+ // GetXattrAt returns the value associated with the given extended
// attribute for the file at rp.
//
// Errors:
//
- // - If extended attributes are not supported by the filesystem, GetxattrAt
+ // - If extended attributes are not supported by the filesystem, GetXattrAt
// returns ENOTSUP.
//
// - If an extended attribute named opts.Name does not exist, ENODATA is
@@ -445,30 +445,30 @@ type FilesystemImpl interface {
// returned (note that implementations are free to ignore opts.Size entirely
// and return without error). In all cases, if opts.Size is 0, the value
// should be returned without error, regardless of size.
- GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error)
+ GetXattrAt(ctx context.Context, rp *ResolvingPath, opts GetXattrOptions) (string, error)
- // SetxattrAt changes the value associated with the given extended
+ // SetXattrAt changes the value associated with the given extended
// attribute for the file at rp.
//
// Errors:
//
- // - If extended attributes are not supported by the filesystem, SetxattrAt
+ // - If extended attributes are not supported by the filesystem, SetXattrAt
// returns ENOTSUP.
//
// - If XATTR_CREATE is set in opts.Flag and opts.Name already exists,
// EEXIST is returned. If XATTR_REPLACE is set and opts.Name does not exist,
// ENODATA is returned.
- SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error
+ SetXattrAt(ctx context.Context, rp *ResolvingPath, opts SetXattrOptions) error
- // RemovexattrAt removes the given extended attribute from the file at rp.
+ // RemoveXattrAt removes the given extended attribute from the file at rp.
//
// Errors:
//
// - If extended attributes are not supported by the filesystem,
- // RemovexattrAt returns ENOTSUP.
+ // RemoveXattrAt returns ENOTSUP.
//
// - If name does not exist, ENODATA is returned.
- RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error
+ RemoveXattrAt(ctx context.Context, rp *ResolvingPath, name string) error
// BoundEndpointAt returns the Unix socket endpoint bound at the path rp.
//
@@ -506,6 +506,8 @@ type FilesystemImpl interface {
// PrependPathAtVFSRootError is returned by implementations of
// FilesystemImpl.PrependPath() when they encounter the contextual VFS root.
+//
+// +stateify savable
type PrependPathAtVFSRootError struct{}
// Error implements error.Error.
@@ -516,6 +518,8 @@ func (PrependPathAtVFSRootError) Error() string {
// PrependPathAtNonMountRootError is returned by implementations of
// FilesystemImpl.PrependPath() when they encounter an independent ancestor
// Dentry that is not the Mount root.
+//
+// +stateify savable
type PrependPathAtNonMountRootError struct{}
// Error implements error.Error.
@@ -526,6 +530,8 @@ func (PrependPathAtNonMountRootError) Error() string {
// PrependPathSyntheticError is returned by implementations of
// FilesystemImpl.PrependPath() for which prepended names do not represent real
// paths.
+//
+// +stateify savable
type PrependPathSyntheticError struct{}
// Error implements error.Error.
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go
index f2298f7f6..bc19db1d5 100644
--- a/pkg/sentry/vfs/filesystem_type.go
+++ b/pkg/sentry/vfs/filesystem_type.go
@@ -55,10 +55,13 @@ type registeredFilesystemType struct {
// RegisterFilesystemTypeOptions contains options to
// VirtualFilesystem.RegisterFilesystem().
+//
+// +stateify savable
type RegisterFilesystemTypeOptions struct {
- // If AllowUserMount is true, allow calls to VirtualFilesystem.MountAt()
- // for which MountOptions.InternalMount == false to use this filesystem
- // type.
+ // AllowUserMount determines whether users are allowed to mount a file system
+ // of this type, i.e. through mount(2). If AllowUserMount is true, allow calls
+ // to VirtualFilesystem.MountAt() for which MountOptions.InternalMount == false
+ // to use this filesystem type.
AllowUserMount bool
// If AllowUserList is true, make this filesystem type visible in
diff --git a/pkg/sentry/vfs/genericfstree/genericfstree.go b/pkg/sentry/vfs/genericfstree/genericfstree.go
index 8882fa84a..2d27d9d35 100644
--- a/pkg/sentry/vfs/genericfstree/genericfstree.go
+++ b/pkg/sentry/vfs/genericfstree/genericfstree.go
@@ -27,6 +27,8 @@ import (
)
// Dentry is a required type parameter that is a struct with the given fields.
+//
+// +stateify savable
type Dentry struct {
// vfsd is the embedded vfs.Dentry corresponding to this vfs.DentryImpl.
vfsd vfs.Dentry
diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go
index aff220a61..3f0b8f45b 100644
--- a/pkg/sentry/vfs/inotify.go
+++ b/pkg/sentry/vfs/inotify.go
@@ -37,6 +37,8 @@ const inotifyEventBaseSize = 16
//
// The way events are labelled appears somewhat arbitrary, but they must match
// Linux so that IN_EXCL_UNLINK behaves as it does in Linux.
+//
+// +stateify savable
type EventType uint8
// PathEvent and InodeEvent correspond to FSNOTIFY_EVENT_PATH and
diff --git a/pkg/sentry/vfs/lock.go b/pkg/sentry/vfs/lock.go
index 42666eebf..55783d4eb 100644
--- a/pkg/sentry/vfs/lock.go
+++ b/pkg/sentry/vfs/lock.go
@@ -33,6 +33,8 @@ import (
// Note that in Linux these two types of locks are _not_ cooperative, because
// race and deadlock conditions make merging them prohibitive. We do the same
// and keep them oblivious to each other.
+//
+// +stateify savable
type FileLocks struct {
// bsd is a set of BSD-style advisory file wide locks, see flock(2).
bsd fslock.Locks
diff --git a/pkg/sentry/vfs/memxattr/xattr.go b/pkg/sentry/vfs/memxattr/xattr.go
index cc1e7d764..638b5d830 100644
--- a/pkg/sentry/vfs/memxattr/xattr.go
+++ b/pkg/sentry/vfs/memxattr/xattr.go
@@ -33,8 +33,8 @@ type SimpleExtendedAttributes struct {
xattrs map[string]string
}
-// Getxattr returns the value at 'name'.
-func (x *SimpleExtendedAttributes) Getxattr(opts *vfs.GetxattrOptions) (string, error) {
+// GetXattr returns the value at 'name'.
+func (x *SimpleExtendedAttributes) GetXattr(opts *vfs.GetXattrOptions) (string, error) {
x.mu.RLock()
value, ok := x.xattrs[opts.Name]
x.mu.RUnlock()
@@ -49,8 +49,8 @@ func (x *SimpleExtendedAttributes) Getxattr(opts *vfs.GetxattrOptions) (string,
return value, nil
}
-// Setxattr sets 'value' at 'name'.
-func (x *SimpleExtendedAttributes) Setxattr(opts *vfs.SetxattrOptions) error {
+// SetXattr sets 'value' at 'name'.
+func (x *SimpleExtendedAttributes) SetXattr(opts *vfs.SetXattrOptions) error {
x.mu.Lock()
defer x.mu.Unlock()
if x.xattrs == nil {
@@ -72,8 +72,8 @@ func (x *SimpleExtendedAttributes) Setxattr(opts *vfs.SetxattrOptions) error {
return nil
}
-// Listxattr returns all names in xattrs.
-func (x *SimpleExtendedAttributes) Listxattr(size uint64) ([]string, error) {
+// ListXattr returns all names in xattrs.
+func (x *SimpleExtendedAttributes) ListXattr(size uint64) ([]string, error) {
// Keep track of the size of the buffer needed in listxattr(2) for the list.
listSize := 0
x.mu.RLock()
@@ -90,8 +90,8 @@ func (x *SimpleExtendedAttributes) Listxattr(size uint64) ([]string, error) {
return names, nil
}
-// Removexattr removes the xattr at 'name'.
-func (x *SimpleExtendedAttributes) Removexattr(name string) error {
+// RemoveXattr removes the xattr at 'name'.
+func (x *SimpleExtendedAttributes) RemoveXattr(name string) error {
x.mu.Lock()
defer x.mu.Unlock()
if _, ok := x.xattrs[name]; !ok {
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index db5fb3bb1..dfc3ae6c0 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -18,14 +18,12 @@ import (
"bytes"
"fmt"
"math"
- "path"
"sort"
"strings"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -67,7 +65,7 @@ type Mount struct {
//
// Invariant: key.parent != nil iff key.point != nil. key.point belongs to
// key.parent.fs.
- key mountKey
+ key mountKey `state:".(VirtualDentry)"`
// ns is the namespace in which this Mount was mounted. ns is protected by
// VirtualFilesystem.mountMu.
@@ -154,13 +152,13 @@ type MountNamespace struct {
// NewMountNamespace returns a new mount namespace with a root filesystem
// configured by the given arguments. A reference is taken on the returned
// MountNamespace.
-func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) {
+func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *MountOptions) (*MountNamespace, error) {
rft := vfs.getFilesystemType(fsTypeName)
if rft == nil {
ctx.Warningf("Unknown filesystem type: %s", fsTypeName)
return nil, syserror.ENODEV
}
- fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
+ fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
if err != nil {
return nil, err
}
@@ -169,7 +167,7 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
mountpoints: make(map[*Dentry]uint32),
}
mntns.EnableLeakCheck()
- mntns.root = newMount(vfs, fs, root, mntns, &MountOptions{})
+ mntns.root = newMount(vfs, fs, root, mntns, opts)
return mntns, nil
}
@@ -347,6 +345,7 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti
return nil
}
+// +stateify savable
type umountRecursiveOptions struct {
// If eager is true, ensure that future calls to Mount.tryIncMountedRef()
// on umounted mounts fail.
@@ -416,7 +415,7 @@ func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns
}
}
mnt.IncRef() // dropped by callers of umountRecursiveLocked
- mnt.storeKey(vd)
+ mnt.setKey(vd)
if vd.mount.children == nil {
vd.mount.children = make(map[*Mount]struct{})
}
@@ -441,13 +440,13 @@ func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns
// * vfs.mounts.seq must be in a writer critical section.
// * mnt.parent() != nil.
func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry {
- vd := mnt.loadKey()
+ vd := mnt.getKey()
if checkInvariants {
if vd.mount != nil {
panic("VFS.disconnectLocked called on disconnected mount")
}
}
- mnt.storeKey(VirtualDentry{})
+ mnt.loadKey(VirtualDentry{})
delete(vd.mount.children, mnt)
atomic.AddUint32(&vd.dentry.mounts, math.MaxUint32) // -1
mnt.ns.mountpoints[vd.dentry]--
@@ -740,11 +739,23 @@ func (mntns *MountNamespace) Root() VirtualDentry {
//
// Preconditions: taskRootDir.Ok().
func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
- vfs.mountMu.Lock()
- defer vfs.mountMu.Unlock()
rootMnt := taskRootDir.mount
+
+ vfs.mountMu.Lock()
mounts := rootMnt.submountsLocked()
+ // Take a reference on mounts since we need to drop vfs.mountMu before
+ // calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()).
+ for _, mnt := range mounts {
+ mnt.IncRef()
+ }
+ vfs.mountMu.Unlock()
+ defer func() {
+ for _, mnt := range mounts {
+ mnt.DecRef(ctx)
+ }
+ }()
sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
+
for _, mnt := range mounts {
// Get the path to this mount relative to task root.
mntRootVD := VirtualDentry{
@@ -755,7 +766,7 @@ func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDi
if err != nil {
// For some reason we didn't get a path. Log a warning
// and run with empty path.
- ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err)
+ ctx.Warningf("VFS.GenerateProcMounts: error getting pathname for mount root %+v: %v", mnt.root, err)
path = ""
}
if path == "" {
@@ -789,11 +800,25 @@ func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDi
//
// Preconditions: taskRootDir.Ok().
func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
- vfs.mountMu.Lock()
- defer vfs.mountMu.Unlock()
rootMnt := taskRootDir.mount
+
+ vfs.mountMu.Lock()
mounts := rootMnt.submountsLocked()
+ // Take a reference on mounts since we need to drop vfs.mountMu before
+ // calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()) or
+ // vfs.StatAt() (=> FilesystemImpl.StatAt()).
+ for _, mnt := range mounts {
+ mnt.IncRef()
+ }
+ vfs.mountMu.Unlock()
+ defer func() {
+ for _, mnt := range mounts {
+ mnt.DecRef(ctx)
+ }
+ }()
sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
+
+ creds := auth.CredentialsFromContext(ctx)
for _, mnt := range mounts {
// Get the path to this mount relative to task root.
mntRootVD := VirtualDentry{
@@ -804,7 +829,7 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo
if err != nil {
// For some reason we didn't get a path. Log a warning
// and run with empty path.
- ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err)
+ ctx.Warningf("VFS.GenerateProcMountInfo: error getting pathname for mount root %+v: %v", mnt.root, err)
path = ""
}
if path == "" {
@@ -817,9 +842,10 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo
Root: mntRootVD,
Start: mntRootVD,
}
- statx, err := vfs.StatAt(ctx, auth.NewAnonymousCredentials(), pop, &StatOptions{})
+ statx, err := vfs.StatAt(ctx, creds, pop, &StatOptions{})
if err != nil {
// Well that's not good. Ignore this mount.
+ ctx.Warningf("VFS.GenerateProcMountInfo: failed to stat mount root %+v: %v", mnt.root, err)
break
}
@@ -831,6 +857,9 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo
fmt.Fprintf(buf, "%d ", mnt.ID)
// (2) Parent ID (or this ID if there is no parent).
+ // Note that even if the call to mnt.parent() races with Mount
+ // destruction (which is possible since we're not holding vfs.mountMu),
+ // its Mount.ID will still be valid.
pID := mnt.ID
if p := mnt.parent(); p != nil {
pID = p.ID
@@ -879,30 +908,6 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo
}
}
-// MakeSyntheticMountpoint creates parent directories of target if they do not
-// exist and attempts to create a directory for the mountpoint. If a
-// non-directory file already exists there then we allow it.
-func (vfs *VirtualFilesystem) MakeSyntheticMountpoint(ctx context.Context, target string, root VirtualDentry, creds *auth.Credentials) error {
- mkdirOpts := &MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}
-
- // Make sure the parent directory of target exists.
- if err := vfs.MkdirAllAt(ctx, path.Dir(target), root, creds, mkdirOpts); err != nil {
- return fmt.Errorf("failed to create parent directory of mountpoint %q: %w", target, err)
- }
-
- // Attempt to mkdir the final component. If a file (of any type) exists
- // then we let allow mounting on top of that because we do not require the
- // target to be an existing directory, unlike Linux mount(2).
- if err := vfs.MkdirAt(ctx, creds, &PathOperation{
- Root: root,
- Start: root,
- Path: fspath.Parse(target),
- }, mkdirOpts); err != nil && err != syserror.EEXIST {
- return fmt.Errorf("failed to create mountpoint %q: %w", target, err)
- }
- return nil
-}
-
// manglePath replaces ' ', '\t', '\n', and '\\' with their octal equivalents.
// See Linux fs/seq_file.c:mangle_path.
func manglePath(p string) string {
diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go
index 3335e4057..cb8c56bd3 100644
--- a/pkg/sentry/vfs/mount_test.go
+++ b/pkg/sentry/vfs/mount_test.go
@@ -38,7 +38,7 @@ func TestMountTableInsertLookup(t *testing.T) {
mt.Init()
mount := &Mount{}
- mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}})
+ mount.setKey(VirtualDentry{&Mount{}, &Dentry{}})
mt.Insert(mount)
if m := mt.Lookup(mount.parent(), mount.point()); m != mount {
@@ -79,7 +79,7 @@ const enableComparativeBenchmarks = false
func newBenchMount() *Mount {
mount := &Mount{}
- mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}})
+ mount.loadKey(VirtualDentry{&Mount{}, &Dentry{}})
return mount
}
@@ -94,7 +94,7 @@ func BenchmarkMountTableParallelLookup(b *testing.B) {
for i := 0; i < numMounts; i++ {
mount := newBenchMount()
mt.Insert(mount)
- keys = append(keys, mount.loadKey())
+ keys = append(keys, mount.saveKey())
}
var ready sync.WaitGroup
@@ -146,7 +146,7 @@ func BenchmarkMountMapParallelLookup(b *testing.B) {
keys := make([]VirtualDentry, 0, numMounts)
for i := 0; i < numMounts; i++ {
mount := newBenchMount()
- key := mount.loadKey()
+ key := mount.saveKey()
ms[key] = mount
keys = append(keys, key)
}
@@ -201,7 +201,7 @@ func BenchmarkMountSyncMapParallelLookup(b *testing.B) {
keys := make([]VirtualDentry, 0, numMounts)
for i := 0; i < numMounts; i++ {
mount := newBenchMount()
- key := mount.loadKey()
+ key := mount.getKey()
ms.Store(key, mount)
keys = append(keys, key)
}
@@ -283,7 +283,7 @@ func BenchmarkMountMapNegativeLookup(b *testing.B) {
ms := make(map[VirtualDentry]*Mount)
for i := 0; i < numMounts; i++ {
mount := newBenchMount()
- ms[mount.loadKey()] = mount
+ ms[mount.getKey()] = mount
}
negkeys := make([]VirtualDentry, 0, numMounts)
for i := 0; i < numMounts; i++ {
@@ -318,7 +318,7 @@ func BenchmarkMountSyncMapNegativeLookup(b *testing.B) {
var ms sync.Map
for i := 0; i < numMounts; i++ {
mount := newBenchMount()
- ms.Store(mount.loadKey(), mount)
+ ms.Store(mount.saveKey(), mount)
}
negkeys := make([]VirtualDentry, 0, numMounts)
for i := 0; i < numMounts; i++ {
@@ -372,7 +372,7 @@ func BenchmarkMountMapInsert(b *testing.B) {
b.ResetTimer()
for i := range mounts {
mount := mounts[i]
- ms[mount.loadKey()] = mount
+ ms[mount.saveKey()] = mount
}
}
@@ -392,7 +392,7 @@ func BenchmarkMountSyncMapInsert(b *testing.B) {
b.ResetTimer()
for i := range mounts {
mount := mounts[i]
- ms.Store(mount.loadKey(), mount)
+ ms.Store(mount.saveKey(), mount)
}
}
@@ -425,13 +425,13 @@ func BenchmarkMountMapRemove(b *testing.B) {
ms := make(map[VirtualDentry]*Mount)
for i := range mounts {
mount := mounts[i]
- ms[mount.loadKey()] = mount
+ ms[mount.saveKey()] = mount
}
b.ResetTimer()
for i := range mounts {
mount := mounts[i]
- delete(ms, mount.loadKey())
+ delete(ms, mount.saveKey())
}
}
@@ -447,12 +447,12 @@ func BenchmarkMountSyncMapRemove(b *testing.B) {
var ms sync.Map
for i := range mounts {
mount := mounts[i]
- ms.Store(mount.loadKey(), mount)
+ ms.Store(mount.saveKey(), mount)
}
b.ResetTimer()
for i := range mounts {
mount := mounts[i]
- ms.Delete(mount.loadKey())
+ ms.Delete(mount.saveKey())
}
}
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index da2a2e9c4..b7d122d22 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -34,6 +34,8 @@ import (
// structurally identical to VirtualDentry, but stores its fields as
// unsafe.Pointer since mutators synchronize with VFS path traversal using
// seqcounts.
+//
+// This is explicitly not savable.
type mountKey struct {
parent unsafe.Pointer // *Mount
point unsafe.Pointer // *Dentry
@@ -47,19 +49,23 @@ func (mnt *Mount) point() *Dentry {
return (*Dentry)(atomic.LoadPointer(&mnt.key.point))
}
-func (mnt *Mount) loadKey() VirtualDentry {
+func (mnt *Mount) getKey() VirtualDentry {
return VirtualDentry{
mount: mnt.parent(),
dentry: mnt.point(),
}
}
+func (mnt *Mount) saveKey() VirtualDentry { return mnt.getKey() }
+
// Invariant: mnt.key.parent == nil. vd.Ok().
-func (mnt *Mount) storeKey(vd VirtualDentry) {
+func (mnt *Mount) setKey(vd VirtualDentry) {
atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(vd.mount))
atomic.StorePointer(&mnt.key.point, unsafe.Pointer(vd.dentry))
}
+func (mnt *Mount) loadKey(vd VirtualDentry) { mnt.setKey(vd) }
+
// mountTable maps (mount parent, mount point) pairs to mounts. It supports
// efficient concurrent lookup, even in the presence of concurrent mutators
// (provided mutation is sufficiently uncommon).
@@ -92,6 +98,7 @@ type mountTable struct {
// length and cap in separate uint32s) for ~free.
size uint64
+ // FIXME(gvisor.dev/issue/1663): Slots need to be saved.
slots unsafe.Pointer `state:"nosave"` // []mountSlot; never nil after Init
}
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index dfc8573fd..bc79e5ecc 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -21,6 +21,8 @@ import (
// GetDentryOptions contains options to VirtualFilesystem.GetDentryAt() and
// FilesystemImpl.GetDentryAt().
+//
+// +stateify savable
type GetDentryOptions struct {
// If CheckSearchable is true, FilesystemImpl.GetDentryAt() must check that
// the returned Dentry is a directory for which creds has search
@@ -30,6 +32,8 @@ type GetDentryOptions struct {
// MkdirOptions contains options to VirtualFilesystem.MkdirAt() and
// FilesystemImpl.MkdirAt().
+//
+// +stateify savable
type MkdirOptions struct {
// Mode is the file mode bits for the created directory.
Mode linux.FileMode
@@ -56,6 +60,8 @@ type MkdirOptions struct {
// MknodOptions contains options to VirtualFilesystem.MknodAt() and
// FilesystemImpl.MknodAt().
+//
+// +stateify savable
type MknodOptions struct {
// Mode is the file type and mode bits for the created file.
Mode linux.FileMode
@@ -72,6 +78,8 @@ type MknodOptions struct {
// MountFlags contains flags as specified for mount(2), e.g. MS_NOEXEC.
// MS_RDONLY is not part of MountFlags because it's tracked in Mount.writers.
+//
+// +stateify savable
type MountFlags struct {
// NoExec is equivalent to MS_NOEXEC.
NoExec bool
@@ -93,6 +101,8 @@ type MountFlags struct {
}
// MountOptions contains options to VirtualFilesystem.MountAt().
+//
+// +stateify savable
type MountOptions struct {
// Flags contains flags as specified for mount(2), e.g. MS_NOEXEC.
Flags MountFlags
@@ -103,13 +113,17 @@ type MountOptions struct {
// GetFilesystemOptions contains options to FilesystemType.GetFilesystem().
GetFilesystemOptions GetFilesystemOptions
- // If InternalMount is true, allow the use of filesystem types for which
- // RegisterFilesystemTypeOptions.AllowUserMount == false.
+ // InternalMount indicates whether the mount operation is coming from the
+ // application, i.e. through mount(2). If InternalMount is true, allow the use
+ // of filesystem types for which RegisterFilesystemTypeOptions.AllowUserMount
+ // == false.
InternalMount bool
}
// OpenOptions contains options to VirtualFilesystem.OpenAt() and
// FilesystemImpl.OpenAt().
+//
+// +stateify savable
type OpenOptions struct {
// Flags contains access mode and flags as specified for open(2).
//
@@ -135,6 +149,8 @@ type OpenOptions struct {
// ReadOptions contains options to FileDescription.PRead(),
// FileDescriptionImpl.PRead(), FileDescription.Read(), and
// FileDescriptionImpl.Read().
+//
+// +stateify savable
type ReadOptions struct {
// Flags contains flags as specified for preadv2(2).
Flags uint32
@@ -142,6 +158,8 @@ type ReadOptions struct {
// RenameOptions contains options to VirtualFilesystem.RenameAt() and
// FilesystemImpl.RenameAt().
+//
+// +stateify savable
type RenameOptions struct {
// Flags contains flags as specified for renameat2(2).
Flags uint32
@@ -153,6 +171,8 @@ type RenameOptions struct {
// SetStatOptions contains options to VirtualFilesystem.SetStatAt(),
// FilesystemImpl.SetStatAt(), FileDescription.SetStat(), and
// FileDescriptionImpl.SetStat().
+//
+// +stateify savable
type SetStatOptions struct {
// Stat is the metadata that should be set. Only fields indicated by
// Stat.Mask should be set.
@@ -174,6 +194,8 @@ type SetStatOptions struct {
// BoundEndpointOptions contains options to VirtualFilesystem.BoundEndpointAt()
// and FilesystemImpl.BoundEndpointAt().
+//
+// +stateify savable
type BoundEndpointOptions struct {
// Addr is the path of the file whose socket endpoint is being retrieved.
// It is generally irrelevant: most endpoints are stored at a dentry that
@@ -190,10 +212,12 @@ type BoundEndpointOptions struct {
Addr string
}
-// GetxattrOptions contains options to VirtualFilesystem.GetxattrAt(),
-// FilesystemImpl.GetxattrAt(), FileDescription.Getxattr(), and
-// FileDescriptionImpl.Getxattr().
-type GetxattrOptions struct {
+// GetXattrOptions contains options to VirtualFilesystem.GetXattrAt(),
+// FilesystemImpl.GetXattrAt(), FileDescription.GetXattr(), and
+// FileDescriptionImpl.GetXattr().
+//
+// +stateify savable
+type GetXattrOptions struct {
// Name is the name of the extended attribute to retrieve.
Name string
@@ -204,10 +228,12 @@ type GetxattrOptions struct {
Size uint64
}
-// SetxattrOptions contains options to VirtualFilesystem.SetxattrAt(),
-// FilesystemImpl.SetxattrAt(), FileDescription.Setxattr(), and
-// FileDescriptionImpl.Setxattr().
-type SetxattrOptions struct {
+// SetXattrOptions contains options to VirtualFilesystem.SetXattrAt(),
+// FilesystemImpl.SetXattrAt(), FileDescription.SetXattr(), and
+// FileDescriptionImpl.SetXattr().
+//
+// +stateify savable
+type SetXattrOptions struct {
// Name is the name of the extended attribute being mutated.
Name string
@@ -221,6 +247,8 @@ type SetxattrOptions struct {
// StatOptions contains options to VirtualFilesystem.StatAt(),
// FilesystemImpl.StatAt(), FileDescription.Stat(), and
// FileDescriptionImpl.Stat().
+//
+// +stateify savable
type StatOptions struct {
// Mask is the set of fields in the returned Statx that the FilesystemImpl
// or FileDescriptionImpl should provide. Bits are as in linux.Statx.Mask.
@@ -238,6 +266,8 @@ type StatOptions struct {
}
// UmountOptions contains options to VirtualFilesystem.UmountAt().
+//
+// +stateify savable
type UmountOptions struct {
// Flags contains flags as specified for umount2(2).
Flags uint32
@@ -246,6 +276,8 @@ type UmountOptions struct {
// WriteOptions contains options to FileDescription.PWrite(),
// FileDescriptionImpl.PWrite(), FileDescription.Write(), and
// FileDescriptionImpl.Write().
+//
+// +stateify savable
type WriteOptions struct {
// Flags contains flags as specified for pwritev2(2).
Flags uint32
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
index 014b928ed..d48520d58 100644
--- a/pkg/sentry/vfs/permissions.go
+++ b/pkg/sentry/vfs/permissions.go
@@ -16,6 +16,7 @@ package vfs
import (
"math"
+ "strings"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
@@ -25,6 +26,8 @@ import (
)
// AccessTypes is a bitmask of Unix file permissions.
+//
+// +stateify savable
type AccessTypes uint16
// Bits in AccessTypes.
@@ -284,3 +287,40 @@ func CheckLimit(ctx context.Context, offset, size int64) (int64, error) {
}
return size, nil
}
+
+// CheckXattrPermissions checks permissions for extended attribute access.
+// This is analogous to fs/xattr.c:xattr_permission(). Some key differences:
+// * Does not check for read-only filesystem property.
+// * Does not check inode immutability or append only mode. In both cases EPERM
+// must be returned by filesystem implementations.
+// * Does not do inode permission checks. Filesystem implementations should
+// handle inode permission checks as they may differ across implementations.
+func CheckXattrPermissions(creds *auth.Credentials, ats AccessTypes, mode linux.FileMode, kuid auth.KUID, name string) error {
+ switch {
+ case strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX):
+ // The trusted.* namespace can only be accessed by privileged
+ // users.
+ if creds.HasCapability(linux.CAP_SYS_ADMIN) {
+ return nil
+ }
+ if ats.MayWrite() {
+ return syserror.EPERM
+ }
+ return syserror.ENODATA
+ case strings.HasPrefix(name, linux.XATTR_USER_PREFIX):
+ // In the user.* namespace, only regular files and directories can have
+ // extended attributes. For sticky directories, only the owner and
+ // privileged users can write attributes.
+ filetype := mode.FileType()
+ if filetype != linux.ModeRegular && filetype != linux.ModeDirectory {
+ if ats.MayWrite() {
+ return syserror.EPERM
+ }
+ return syserror.ENODATA
+ }
+ if filetype == linux.ModeDirectory && mode&linux.ModeSticky != 0 && ats.MayWrite() && !CanActAsOwner(creds, kuid) {
+ return syserror.EPERM
+ }
+ }
+ return nil
+}
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index 3304372d9..e4fd55012 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -35,6 +35,8 @@ import (
// FilesystemImpl methods.
//
// ResolvingPath is loosely analogous to Linux's struct nameidata.
+//
+// +stateify savable
type ResolvingPath struct {
vfs *VirtualFilesystem
root VirtualDentry // refs borrowed from PathOperation
@@ -88,6 +90,7 @@ func init() {
// so error "constants" are really mutable vars, necessitating somewhat
// expensive interface object comparisons.
+// +stateify savable
type resolveMountRootOrJumpError struct{}
// Error implements error.Error.
@@ -95,6 +98,7 @@ func (resolveMountRootOrJumpError) Error() string {
return "resolving mount root or jump"
}
+// +stateify savable
type resolveMountPointError struct{}
// Error implements error.Error.
@@ -102,6 +106,7 @@ func (resolveMountPointError) Error() string {
return "resolving mount point"
}
+// +stateify savable
type resolveAbsSymlinkError struct{}
// Error implements error.Error.
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index ec27562d6..5bd756ea5 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -163,6 +163,8 @@ func (vfs *VirtualFilesystem) Init(ctx context.Context) error {
// PathOperation is passed to VFS methods by pointer to reduce memory copying:
// it's somewhat large and should never escape. (Options structs are passed by
// pointer to VFS and FileDescription methods for the same reason.)
+//
+// +stateify savable
type PathOperation struct {
// Root is the VFS root. References on Root are borrowed from the provider
// of the PathOperation.
@@ -297,6 +299,8 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential
// MkdirAt creates a directory at the given path.
func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error {
if !pop.Path.Begin.Ok() {
+ // pop.Path should not be empty in operations that create/delete files.
+ // This is consistent with mkdirat(dirfd, "", mode).
if pop.Path.Absolute {
return syserror.EEXIST
}
@@ -333,6 +337,8 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia
// error from the syserror package.
func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error {
if !pop.Path.Begin.Ok() {
+ // pop.Path should not be empty in operations that create/delete files.
+ // This is consistent with mknodat(dirfd, "", mode, dev).
if pop.Path.Absolute {
return syserror.EEXIST
}
@@ -518,6 +524,8 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti
// RmdirAt removes the directory at the given path.
func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
if !pop.Path.Begin.Ok() {
+ // pop.Path should not be empty in operations that create/delete files.
+ // This is consistent with unlinkat(dirfd, "", AT_REMOVEDIR).
if pop.Path.Absolute {
return syserror.EBUSY
}
@@ -599,6 +607,8 @@ func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credenti
// SymlinkAt creates a symbolic link at the given path with the given target.
func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error {
if !pop.Path.Begin.Ok() {
+ // pop.Path should not be empty in operations that create/delete files.
+ // This is consistent with symlinkat(oldpath, newdirfd, "").
if pop.Path.Absolute {
return syserror.EEXIST
}
@@ -631,6 +641,8 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent
// UnlinkAt deletes the non-directory file at the given path.
func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
if !pop.Path.Begin.Ok() {
+ // pop.Path should not be empty in operations that create/delete files.
+ // This is consistent with unlinkat(dirfd, "", 0).
if pop.Path.Absolute {
return syserror.EBUSY
}
@@ -662,12 +674,6 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti
// BoundEndpointAt gets the bound endpoint at the given path, if one exists.
func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *BoundEndpointOptions) (transport.BoundEndpoint, error) {
- if !pop.Path.Begin.Ok() {
- if pop.Path.Absolute {
- return nil, syserror.ECONNREFUSED
- }
- return nil, syserror.ENOENT
- }
rp := vfs.getResolvingPath(creds, pop)
for {
bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts)
@@ -687,12 +693,12 @@ func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.C
}
}
-// ListxattrAt returns all extended attribute names for the file at the given
+// ListXattrAt returns all extended attribute names for the file at the given
// path.
-func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) {
+func (vfs *VirtualFilesystem) ListXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) {
rp := vfs.getResolvingPath(creds, pop)
for {
- names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp, size)
+ names, err := rp.mount.fs.impl.ListXattrAt(ctx, rp, size)
if err == nil {
vfs.putResolvingPath(ctx, rp)
return names, nil
@@ -712,12 +718,12 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede
}
}
-// GetxattrAt returns the value associated with the given extended attribute
+// GetXattrAt returns the value associated with the given extended attribute
// for the file at the given path.
-func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetxattrOptions) (string, error) {
+func (vfs *VirtualFilesystem) GetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetXattrOptions) (string, error) {
rp := vfs.getResolvingPath(creds, pop)
for {
- val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
+ val, err := rp.mount.fs.impl.GetXattrAt(ctx, rp, *opts)
if err == nil {
vfs.putResolvingPath(ctx, rp)
return val, nil
@@ -729,12 +735,12 @@ func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Creden
}
}
-// SetxattrAt changes the value associated with the given extended attribute
+// SetXattrAt changes the value associated with the given extended attribute
// for the file at the given path.
-func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetxattrOptions) error {
+func (vfs *VirtualFilesystem) SetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetXattrOptions) error {
rp := vfs.getResolvingPath(creds, pop)
for {
- err := rp.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
+ err := rp.mount.fs.impl.SetXattrAt(ctx, rp, *opts)
if err == nil {
vfs.putResolvingPath(ctx, rp)
return nil
@@ -746,11 +752,11 @@ func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Creden
}
}
-// RemovexattrAt removes the given extended attribute from the file at rp.
-func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error {
+// RemoveXattrAt removes the given extended attribute from the file at rp.
+func (vfs *VirtualFilesystem) RemoveXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error {
rp := vfs.getResolvingPath(creds, pop)
for {
- err := rp.mount.fs.impl.RemovexattrAt(ctx, rp, name)
+ err := rp.mount.fs.impl.RemoveXattrAt(ctx, rp, name)
if err == nil {
vfs.putResolvingPath(ctx, rp)
return nil
@@ -815,6 +821,30 @@ func (vfs *VirtualFilesystem) MkdirAllAt(ctx context.Context, currentPath string
return nil
}
+// MakeSyntheticMountpoint creates parent directories of target if they do not
+// exist and attempts to create a directory for the mountpoint. If a
+// non-directory file already exists there then we allow it.
+func (vfs *VirtualFilesystem) MakeSyntheticMountpoint(ctx context.Context, target string, root VirtualDentry, creds *auth.Credentials) error {
+ mkdirOpts := &MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}
+
+ // Make sure the parent directory of target exists.
+ if err := vfs.MkdirAllAt(ctx, path.Dir(target), root, creds, mkdirOpts); err != nil {
+ return fmt.Errorf("failed to create parent directory of mountpoint %q: %w", target, err)
+ }
+
+ // Attempt to mkdir the final component. If a file (of any type) exists
+ // then we let allow mounting on top of that because we do not require the
+ // target to be an existing directory, unlike Linux mount(2).
+ if err := vfs.MkdirAt(ctx, creds, &PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(target),
+ }, mkdirOpts); err != nil && err != syserror.EEXIST {
+ return fmt.Errorf("failed to create mountpoint %q: %w", target, err)
+ }
+ return nil
+}
+
// A VirtualDentry represents a node in a VFS tree, by combining a Dentry
// (which represents a node in a Filesystem's tree) and a Mount (which
// represents the Filesystem's position in a VFS mount tree).