Diffstat (limited to 'pkg/sentry')
-rw-r--r--  pkg/sentry/arch/BUILD | 4
-rw-r--r--  pkg/sentry/arch/arch.go | 7
-rw-r--r--  pkg/sentry/arch/arch_aarch64.go | 37
-rw-r--r--  pkg/sentry/arch/arch_amd64.go | 16
-rw-r--r--  pkg/sentry/arch/arch_arm64.go | 18
-rw-r--r--  pkg/sentry/arch/arch_x86.go | 16
-rw-r--r--  pkg/sentry/arch/registers.proto | 1
-rw-r--r--  pkg/sentry/arch/signal_act.go | 2
-rw-r--r--  pkg/sentry/arch/signal_amd64.go | 27
-rw-r--r--  pkg/sentry/arch/signal_arm64.go | 30
-rw-r--r--  pkg/sentry/arch/signal_stack.go | 2
-rw-r--r--  pkg/sentry/arch/stack.go | 179
-rw-r--r--  pkg/sentry/arch/stack_unsafe.go | 69
-rw-r--r--  pkg/sentry/contexttest/contexttest.go | 22
-rw-r--r--  pkg/sentry/control/BUILD | 5
-rw-r--r--  pkg/sentry/control/logging.go | 4
-rw-r--r--  pkg/sentry/control/pprof.go | 6
-rw-r--r--  pkg/sentry/control/proc.go | 116
-rw-r--r--  pkg/sentry/control/state.go | 2
-rw-r--r--  pkg/sentry/device/device.go | 5
-rw-r--r--  pkg/sentry/devices/memdev/BUILD | 5
-rw-r--r--  pkg/sentry/devices/memdev/full.go | 7
-rw-r--r--  pkg/sentry/devices/memdev/null.go | 7
-rw-r--r--  pkg/sentry/devices/memdev/random.go | 7
-rw-r--r--  pkg/sentry/devices/memdev/zero.go | 31
-rw-r--r--  pkg/sentry/devices/ttydev/BUILD | 16
-rw-r--r--  pkg/sentry/devices/ttydev/ttydev.go | 53
-rw-r--r--  pkg/sentry/devices/tundev/BUILD | 24
-rw-r--r--  pkg/sentry/devices/tundev/tundev.go | 191
-rw-r--r--  pkg/sentry/fdimport/BUILD | 1
-rw-r--r--  pkg/sentry/fdimport/fdimport.go | 35
-rw-r--r--  pkg/sentry/fs/copy_up.go | 25
-rw-r--r--  pkg/sentry/fs/copy_up_test.go | 4
-rw-r--r--  pkg/sentry/fs/dev/BUILD | 1
-rw-r--r--  pkg/sentry/fs/dev/net_tun.go | 69
-rw-r--r--  pkg/sentry/fs/dirent.go | 122
-rw-r--r--  pkg/sentry/fs/dirent_cache.go | 3
-rw-r--r--  pkg/sentry/fs/dirent_refs_test.go | 16
-rw-r--r--  pkg/sentry/fs/dirent_state.go | 3
-rw-r--r--  pkg/sentry/fs/fdpipe/pipe.go | 2
-rw-r--r--  pkg/sentry/fs/fdpipe/pipe_opener_test.go | 16
-rw-r--r--  pkg/sentry/fs/fdpipe/pipe_test.go | 18
-rw-r--r--  pkg/sentry/fs/file.go | 21
-rw-r--r--  pkg/sentry/fs/file_operations.go | 6
-rw-r--r--  pkg/sentry/fs/file_overlay.go | 22
-rw-r--r--  pkg/sentry/fs/filesystems.go | 14
-rw-r--r--  pkg/sentry/fs/fs.go | 3
-rw-r--r--  pkg/sentry/fs/fsutil/BUILD | 7
-rw-r--r--  pkg/sentry/fs/fsutil/dirty_set.go | 7
-rw-r--r--  pkg/sentry/fs/fsutil/file.go | 4
-rw-r--r--  pkg/sentry/fs/fsutil/file_range_set.go | 46
-rw-r--r--  pkg/sentry/fs/fsutil/frame_ref_set.go | 10
-rw-r--r--  pkg/sentry/fs/fsutil/host_file_mapper.go | 24
-rw-r--r--  pkg/sentry/fs/fsutil/host_mappable.go | 19
-rw-r--r--  pkg/sentry/fs/fsutil/inode_cached.go | 46
-rw-r--r--  pkg/sentry/fs/g3doc/fuse.md | 102
-rw-r--r--  pkg/sentry/fs/gofer/file.go | 4
-rw-r--r--  pkg/sentry/fs/gofer/gofer_test.go | 8
-rw-r--r--  pkg/sentry/fs/gofer/handles.go | 5
-rw-r--r--  pkg/sentry/fs/gofer/inode.go | 7
-rw-r--r--  pkg/sentry/fs/gofer/path.go | 6
-rw-r--r--  pkg/sentry/fs/gofer/session.go | 16
-rw-r--r--  pkg/sentry/fs/gofer/session_state.go | 3
-rw-r--r--  pkg/sentry/fs/gofer/socket.go | 6
-rw-r--r--  pkg/sentry/fs/host/BUILD | 2
-rw-r--r--  pkg/sentry/fs/host/control.go | 2
-rw-r--r--  pkg/sentry/fs/host/file.go | 4
-rw-r--r--  pkg/sentry/fs/host/inode.go | 3
-rw-r--r--  pkg/sentry/fs/host/inode_test.go | 2
-rw-r--r--  pkg/sentry/fs/host/socket.go | 20
-rw-r--r--  pkg/sentry/fs/host/socket_iovec.go | 7
-rw-r--r--  pkg/sentry/fs/host/socket_test.go | 38
-rw-r--r--  pkg/sentry/fs/host/socket_unsafe.go | 4
-rw-r--r--  pkg/sentry/fs/host/tty.go | 54
-rw-r--r--  pkg/sentry/fs/host/wait_test.go | 2
-rw-r--r--  pkg/sentry/fs/inode.go | 13
-rw-r--r--  pkg/sentry/fs/inode_inotify.go | 5
-rw-r--r--  pkg/sentry/fs/inode_overlay.go | 41
-rw-r--r--  pkg/sentry/fs/inode_overlay_test.go | 8
-rw-r--r--  pkg/sentry/fs/inotify.go | 8
-rw-r--r--  pkg/sentry/fs/inotify_watch.go | 9
-rw-r--r--  pkg/sentry/fs/lock/lock.go | 41
-rw-r--r--  pkg/sentry/fs/lock/lock_set_functions.go | 8
-rw-r--r--  pkg/sentry/fs/lock/lock_test.go | 111
-rw-r--r--  pkg/sentry/fs/mount.go | 12
-rw-r--r--  pkg/sentry/fs/mount_overlay.go | 6
-rw-r--r--  pkg/sentry/fs/mount_test.go | 29
-rw-r--r--  pkg/sentry/fs/mounts.go | 30
-rw-r--r--  pkg/sentry/fs/mounts_test.go | 2
-rw-r--r--  pkg/sentry/fs/overlay.go | 30
-rw-r--r--  pkg/sentry/fs/proc/BUILD | 1
-rw-r--r--  pkg/sentry/fs/proc/fds.go | 18
-rw-r--r--  pkg/sentry/fs/proc/mounts.go | 8
-rw-r--r--  pkg/sentry/fs/proc/net.go | 12
-rw-r--r--  pkg/sentry/fs/proc/proc.go | 2
-rw-r--r--  pkg/sentry/fs/proc/sys_net.go | 215
-rw-r--r--  pkg/sentry/fs/proc/sys_net_state.go | 15
-rw-r--r--  pkg/sentry/fs/proc/sys_net_test.go | 73
-rw-r--r--  pkg/sentry/fs/proc/task.go | 133
-rw-r--r--  pkg/sentry/fs/ramfs/dir.go | 18
-rw-r--r--  pkg/sentry/fs/ramfs/tree_test.go | 2
-rw-r--r--  pkg/sentry/fs/timerfd/timerfd.go | 4
-rw-r--r--  pkg/sentry/fs/tmpfs/file_test.go | 2
-rw-r--r--  pkg/sentry/fs/tmpfs/inode_file.go | 2
-rw-r--r--  pkg/sentry/fs/tmpfs/tmpfs.go | 12
-rw-r--r--  pkg/sentry/fs/tty/BUILD | 3
-rw-r--r--  pkg/sentry/fs/tty/dir.go | 54
-rw-r--r--  pkg/sentry/fs/tty/fs.go | 6
-rw-r--r--  pkg/sentry/fs/tty/line_discipline.go | 55
-rw-r--r--  pkg/sentry/fs/tty/master.go | 45
-rw-r--r--  pkg/sentry/fs/tty/queue.go | 23
-rw-r--r--  pkg/sentry/fs/tty/replica.go (renamed from pkg/sentry/fs/tty/slave.go) | 90
-rw-r--r--  pkg/sentry/fs/tty/terminal.go | 39
-rw-r--r--  pkg/sentry/fs/tty/tty_test.go | 4
-rw-r--r--  pkg/sentry/fs/user/BUILD | 1
-rw-r--r--  pkg/sentry/fs/user/path.go | 66
-rw-r--r--  pkg/sentry/fs/user/user.go | 9
-rw-r--r--  pkg/sentry/fs/user/user_test.go | 8
-rw-r--r--  pkg/sentry/fsbridge/bridge.go | 2
-rw-r--r--  pkg/sentry/fsbridge/fs.go | 8
-rw-r--r--  pkg/sentry/fsbridge/vfs.go | 8
-rw-r--r--  pkg/sentry/fsimpl/devpts/BUILD | 22
-rw-r--r--  pkg/sentry/fsimpl/devpts/devpts.go | 157
-rw-r--r--  pkg/sentry/fsimpl/devpts/devpts_test.go | 4
-rw-r--r--  pkg/sentry/fsimpl/devpts/line_discipline.go | 57
-rw-r--r--  pkg/sentry/fsimpl/devpts/master.go | 76
-rw-r--r--  pkg/sentry/fsimpl/devpts/queue.go | 23
-rw-r--r--  pkg/sentry/fsimpl/devpts/replica.go | 201
-rw-r--r--  pkg/sentry/fsimpl/devpts/slave.go | 182
-rw-r--r--  pkg/sentry/fsimpl/devpts/terminal.go | 37
-rw-r--r--  pkg/sentry/fsimpl/devtmpfs/BUILD | 6
-rw-r--r--  pkg/sentry/fsimpl/devtmpfs/devtmpfs.go | 41
-rw-r--r--  pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go | 198
-rw-r--r--  pkg/sentry/fsimpl/devtmpfs/save_restore.go | 23
-rw-r--r--  pkg/sentry/fsimpl/eventfd/eventfd.go | 19
-rw-r--r--  pkg/sentry/fsimpl/eventfd/eventfd_test.go | 12
-rw-r--r--  pkg/sentry/fsimpl/ext/BUILD | 9
-rw-r--r--  pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go | 15
-rw-r--r--  pkg/sentry/fsimpl/ext/block_map_file.go | 41
-rw-r--r--  pkg/sentry/fsimpl/ext/block_map_test.go | 77
-rw-r--r--  pkg/sentry/fsimpl/ext/dentry.go | 16
-rw-r--r--  pkg/sentry/fsimpl/ext/directory.go | 35
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/BUILD | 3
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/block_group.go | 6
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/block_group_32.go | 2
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/block_group_64.go | 2
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/block_group_test.go | 6
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/dirent.go | 3
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/dirent_new.go | 4
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/dirent_old.go | 4
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/dirent_test.go | 6
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/disklayout.go | 2
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/extent.go | 12
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/extent_test.go | 9
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/inode.go | 3
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/inode_new.go | 2
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/inode_old.go | 2
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/inode_test.go | 6
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/superblock.go | 6
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/superblock_32.go | 2
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/superblock_64.go | 2
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/superblock_old.go | 2
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/superblock_test.go | 9
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/test_utils.go | 6
-rw-r--r--  pkg/sentry/fsimpl/ext/ext.go | 18
-rw-r--r--  pkg/sentry/fsimpl/ext/ext_test.go | 11
-rw-r--r--  pkg/sentry/fsimpl/ext/extent_file.go | 12
-rw-r--r--  pkg/sentry/fsimpl/ext/extent_test.go | 41
-rw-r--r--  pkg/sentry/fsimpl/ext/file_description.go | 1
-rw-r--r--  pkg/sentry/fsimpl/ext/filesystem.go | 102
-rw-r--r--  pkg/sentry/fsimpl/ext/inode.go | 30
-rw-r--r--  pkg/sentry/fsimpl/ext/regular_file.go | 37
-rw-r--r--  pkg/sentry/fsimpl/ext/symlink.go | 22
-rw-r--r--  pkg/sentry/fsimpl/ext/utils.go | 8
-rw-r--r--  pkg/sentry/fsimpl/fuse/BUILD | 87
-rw-r--r--  pkg/sentry/fsimpl/fuse/connection.go | 322
-rw-r--r--  pkg/sentry/fsimpl/fuse/connection_control.go | 247
-rw-r--r--  pkg/sentry/fsimpl/fuse/connection_test.go | 117
-rw-r--r--  pkg/sentry/fsimpl/fuse/dev.go | 463
-rw-r--r--  pkg/sentry/fsimpl/fuse/dev_test.go | 323
-rw-r--r--  pkg/sentry/fsimpl/fuse/directory.go | 105
-rw-r--r--  pkg/sentry/fsimpl/fuse/file.go | 133
-rw-r--r--  pkg/sentry/fsimpl/fuse/fusefs.go | 822
-rw-r--r--  pkg/sentry/fsimpl/fuse/read_write.go | 244
-rw-r--r--  pkg/sentry/fsimpl/fuse/register.go | 42
-rw-r--r--  pkg/sentry/fsimpl/fuse/regular_file.go | 230
-rw-r--r--  pkg/sentry/fsimpl/fuse/request_response.go | 229
-rw-r--r--  pkg/sentry/fsimpl/fuse/utils_test.go | 132
-rw-r--r--  pkg/sentry/fsimpl/gofer/BUILD | 6
-rw-r--r--  pkg/sentry/fsimpl/gofer/directory.go | 69
-rw-r--r--  pkg/sentry/fsimpl/gofer/filesystem.go | 549
-rw-r--r--  pkg/sentry/fsimpl/gofer/gofer.go | 1214
-rw-r--r--  pkg/sentry/fsimpl/gofer/gofer_test.go | 15
-rw-r--r--  pkg/sentry/fsimpl/gofer/handle.go | 16
-rw-r--r--  pkg/sentry/fsimpl/gofer/host_named_pipe.go | 20
-rw-r--r--  pkg/sentry/fsimpl/gofer/p9file.go | 7
-rw-r--r--  pkg/sentry/fsimpl/gofer/regular_file.go | 356
-rw-r--r--  pkg/sentry/fsimpl/gofer/save_restore.go | 329
-rw-r--r--  pkg/sentry/fsimpl/gofer/socket.go | 15
-rw-r--r--  pkg/sentry/fsimpl/gofer/special_file.go | 256
-rw-r--r--  pkg/sentry/fsimpl/gofer/time.go | 47
-rw-r--r--  pkg/sentry/fsimpl/host/BUILD | 31
-rw-r--r--  pkg/sentry/fsimpl/host/control.go | 4
-rw-r--r--  pkg/sentry/fsimpl/host/host.go | 473
-rw-r--r--  pkg/sentry/fsimpl/host/mmap.go | 27
-rw-r--r--  pkg/sentry/fsimpl/host/save_restore.go | 78
-rw-r--r--  pkg/sentry/fsimpl/host/socket.go | 39
-rw-r--r--  pkg/sentry/fsimpl/host/socket_iovec.go | 7
-rw-r--r--  pkg/sentry/fsimpl/host/socket_unsafe.go | 4
-rw-r--r--  pkg/sentry/fsimpl/host/tty.go | 75
-rw-r--r--  pkg/sentry/fsimpl/host/util.go | 16
-rw-r--r--  pkg/sentry/fsimpl/kernfs/BUILD | 75
-rw-r--r--  pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 39
-rw-r--r--  pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 107
-rw-r--r--  pkg/sentry/fsimpl/kernfs/filesystem.go | 451
-rw-r--r--  pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 400
-rw-r--r--  pkg/sentry/fsimpl/kernfs/kernfs.go | 382
-rw-r--r--  pkg/sentry/fsimpl/kernfs/kernfs_test.go | 171
-rw-r--r--  pkg/sentry/fsimpl/kernfs/symlink.go | 20
-rw-r--r--  pkg/sentry/fsimpl/kernfs/synthetic_directory.go | 113
-rw-r--r--  pkg/sentry/fsimpl/overlay/BUILD | 46
-rw-r--r--  pkg/sentry/fsimpl/overlay/copy_up.go | 429
-rw-r--r--  pkg/sentry/fsimpl/overlay/directory.go | 301
-rw-r--r--  pkg/sentry/fsimpl/overlay/filesystem.go | 1780
-rw-r--r--  pkg/sentry/fsimpl/overlay/overlay.go | 798
-rw-r--r--  pkg/sentry/fsimpl/overlay/regular_file.go | 456
-rw-r--r--  pkg/sentry/fsimpl/overlay/save_restore.go | 27
-rw-r--r--  pkg/sentry/fsimpl/pipefs/pipefs.go | 32
-rw-r--r--  pkg/sentry/fsimpl/proc/BUILD | 65
-rw-r--r--  pkg/sentry/fsimpl/proc/filesystem.go | 66
-rw-r--r--  pkg/sentry/fsimpl/proc/subtasks.go | 59
-rw-r--r--  pkg/sentry/fsimpl/proc/task.go | 119
-rw-r--r--  pkg/sentry/fsimpl/proc/task_fds.go | 157
-rw-r--r--  pkg/sentry/fsimpl/proc/task_files.go | 429
-rw-r--r--  pkg/sentry/fsimpl/proc/task_net.go | 65
-rw-r--r--  pkg/sentry/fsimpl/proc/tasks.go | 91
-rw-r--r--  pkg/sentry/fsimpl/proc/tasks_files.go | 48
-rw-r--r--  pkg/sentry/fsimpl/proc/tasks_sys.go | 330
-rw-r--r--  pkg/sentry/fsimpl/proc/tasks_sys_test.go | 71
-rw-r--r--  pkg/sentry/fsimpl/proc/tasks_test.go | 27
-rw-r--r--  pkg/sentry/fsimpl/signalfd/BUILD | 1
-rw-r--r--  pkg/sentry/fsimpl/signalfd/signalfd.go | 29
-rw-r--r--  pkg/sentry/fsimpl/sockfs/sockfs.go | 33
-rw-r--r--  pkg/sentry/fsimpl/sys/BUILD | 23
-rw-r--r--  pkg/sentry/fsimpl/sys/kcov.go | 118
-rw-r--r--  pkg/sentry/fsimpl/sys/sys.go | 163
-rw-r--r--  pkg/sentry/fsimpl/sys/sys_test.go | 6
-rw-r--r--  pkg/sentry/fsimpl/testutil/BUILD | 2
-rw-r--r--  pkg/sentry/fsimpl/testutil/kernel.go | 21
-rw-r--r--  pkg/sentry/fsimpl/testutil/testutil.go | 16
-rw-r--r--  pkg/sentry/fsimpl/timerfd/timerfd.go | 15
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/BUILD | 16
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/benchmark_test.go | 58
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/device_file.go | 5
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/directory.go | 23
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/filesystem.go | 280
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/named_pipe.go | 9
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/pipe_test.go | 23
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/regular_file.go | 182
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 33
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/save_restore.go | 20
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/socket_file.go | 6
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/symlink.go | 5
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/tmpfs.go | 424
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/tmpfs_test.go | 9
-rw-r--r--  pkg/sentry/fsimpl/verity/BUILD | 51
-rw-r--r--  pkg/sentry/fsimpl/verity/filesystem.go | 1046
-rw-r--r--  pkg/sentry/fsimpl/verity/save_restore.go | 27
-rw-r--r--  pkg/sentry/fsimpl/verity/verity.go | 880
-rw-r--r--  pkg/sentry/fsimpl/verity/verity_test.go | 700
-rw-r--r--  pkg/sentry/hostmm/BUILD | 3
-rw-r--r--  pkg/sentry/hostmm/membarrier.go | 90
-rw-r--r--  pkg/sentry/inet/BUILD | 1
-rw-r--r--  pkg/sentry/inet/inet.go | 34
-rw-r--r--  pkg/sentry/inet/test_stack.go | 50
-rw-r--r--  pkg/sentry/kernel/BUILD | 77
-rw-r--r--  pkg/sentry/kernel/abstract_socket_namespace.go | 82
-rw-r--r--  pkg/sentry/kernel/aio.go | 81
-rw-r--r--  pkg/sentry/kernel/auth/BUILD | 1
-rw-r--r--  pkg/sentry/kernel/auth/context.go | 20
-rw-r--r--  pkg/sentry/kernel/auth/credentials.go | 28
-rw-r--r--  pkg/sentry/kernel/auth/id.go | 4
-rw-r--r--  pkg/sentry/kernel/context.go | 56
-rw-r--r--  pkg/sentry/kernel/epoll/epoll.go | 45
-rw-r--r--  pkg/sentry/kernel/epoll/epoll_state.go | 3
-rw-r--r--  pkg/sentry/kernel/epoll/epoll_test.go | 5
-rw-r--r--  pkg/sentry/kernel/eventfd/eventfd.go | 4
-rw-r--r--  pkg/sentry/kernel/fasync/BUILD | 1
-rw-r--r--  pkg/sentry/kernel/fasync/fasync.go | 18
-rw-r--r--  pkg/sentry/kernel/fd_table.go | 266
-rw-r--r--  pkg/sentry/kernel/fd_table_test.go | 14
-rw-r--r--  pkg/sentry/kernel/fd_table_unsafe.go | 74
-rw-r--r--  pkg/sentry/kernel/fs_context.go | 124
-rw-r--r--  pkg/sentry/kernel/futex/BUILD | 1
-rw-r--r--  pkg/sentry/kernel/futex/futex.go | 43
-rw-r--r--  pkg/sentry/kernel/futex/futex_test.go | 66
-rw-r--r--  pkg/sentry/kernel/ipc_namespace.go | 14
-rw-r--r--  pkg/sentry/kernel/kcov.go | 338
-rw-r--r--  pkg/sentry/kernel/kcov_unsafe.go | 28
-rw-r--r--  pkg/sentry/kernel/kernel.go | 432
-rw-r--r--  pkg/sentry/kernel/pipe/BUILD | 2
-rw-r--r--  pkg/sentry/kernel/pipe/node.go | 6
-rw-r--r--  pkg/sentry/kernel/pipe/node_test.go | 2
-rw-r--r--  pkg/sentry/kernel/pipe/pipe.go | 19
-rw-r--r--  pkg/sentry/kernel/pipe/pipe_test.go | 16
-rw-r--r--  pkg/sentry/kernel/pipe/pipe_util.go | 14
-rw-r--r--  pkg/sentry/kernel/pipe/reader.go | 3
-rw-r--r--  pkg/sentry/kernel/pipe/vfs.go | 111
-rw-r--r--  pkg/sentry/kernel/pipe/writer.go | 3
-rw-r--r--  pkg/sentry/kernel/ptrace.go | 58
-rw-r--r--  pkg/sentry/kernel/ptrace_amd64.go | 2
-rw-r--r--  pkg/sentry/kernel/rseq.go | 31
-rw-r--r--  pkg/sentry/kernel/seccomp.go | 46
-rw-r--r--  pkg/sentry/kernel/semaphore/semaphore.go | 29
-rw-r--r--  pkg/sentry/kernel/sessions.go | 28
-rw-r--r--  pkg/sentry/kernel/shm/BUILD | 15
-rw-r--r--  pkg/sentry/kernel/shm/shm.go | 72
-rw-r--r--  pkg/sentry/kernel/signalfd/BUILD | 1
-rw-r--r--  pkg/sentry/kernel/signalfd/signalfd.go | 16
-rw-r--r--  pkg/sentry/kernel/syscalls.go | 10
-rw-r--r--  pkg/sentry/kernel/syslog.go | 9
-rw-r--r--  pkg/sentry/kernel/task.go | 49
-rw-r--r--  pkg/sentry/kernel/task_clone.go | 40
-rw-r--r--  pkg/sentry/kernel/task_context.go | 6
-rw-r--r--  pkg/sentry/kernel/task_exec.go | 17
-rw-r--r--  pkg/sentry/kernel/task_exit.go | 17
-rw-r--r--  pkg/sentry/kernel/task_futex.go | 126
-rw-r--r--  pkg/sentry/kernel/task_log.go | 45
-rw-r--r--  pkg/sentry/kernel/task_run.go | 26
-rw-r--r--  pkg/sentry/kernel/task_sched.go | 11
-rw-r--r--  pkg/sentry/kernel/task_signals.go | 34
-rw-r--r--  pkg/sentry/kernel/task_start.go | 13
-rw-r--r--  pkg/sentry/kernel/task_stop.go | 30
-rw-r--r--  pkg/sentry/kernel/task_syscall.go | 76
-rw-r--r--  pkg/sentry/kernel/task_usermem.go | 64
-rw-r--r--  pkg/sentry/kernel/task_work.go | 38
-rw-r--r--  pkg/sentry/kernel/thread_group.go | 10
-rw-r--r--  pkg/sentry/kernel/threads.go | 16
-rw-r--r--  pkg/sentry/kernel/time/BUILD | 1
-rw-r--r--  pkg/sentry/kernel/time/tcpip.go | 131
-rw-r--r--  pkg/sentry/kernel/time/time.go | 6
-rw-r--r--  pkg/sentry/kernel/timekeeper.go | 9
-rw-r--r--  pkg/sentry/kernel/vdso.go | 28
-rw-r--r--  pkg/sentry/limits/context.go | 9
-rw-r--r--  pkg/sentry/loader/BUILD | 4
-rw-r--r--  pkg/sentry/loader/elf.go | 40
-rw-r--r--  pkg/sentry/loader/loader.go | 41
-rw-r--r--  pkg/sentry/loader/vdso.go | 67
-rw-r--r--  pkg/sentry/memmap/BUILD | 14
-rw-r--r--  pkg/sentry/memmap/mapping_set.go | 4
-rw-r--r--  pkg/sentry/memmap/memmap.go | 116
-rw-r--r--  pkg/sentry/mm/BUILD | 29
-rw-r--r--  pkg/sentry/mm/address_space.go | 8
-rw-r--r--  pkg/sentry/mm/aio_context.go | 14
-rw-r--r--  pkg/sentry/mm/io.go | 9
-rw-r--r--  pkg/sentry/mm/lifecycle.go | 4
-rw-r--r--  pkg/sentry/mm/metadata.go | 5
-rw-r--r--  pkg/sentry/mm/mm.go | 26
-rw-r--r--  pkg/sentry/mm/mm_test.go | 3
-rw-r--r--  pkg/sentry/mm/pma.go | 134
-rw-r--r--  pkg/sentry/mm/special_mappable.go | 25
-rw-r--r--  pkg/sentry/mm/syscalls.go | 49
-rw-r--r--  pkg/sentry/mm/vma.go | 63
-rw-r--r--  pkg/sentry/pgalloc/BUILD | 30
-rw-r--r--  pkg/sentry/pgalloc/pgalloc.go | 410
-rw-r--r--  pkg/sentry/pgalloc/pgalloc_test.go | 206
-rw-r--r--  pkg/sentry/pgalloc/save_restore.go | 13
-rw-r--r--  pkg/sentry/platform/BUILD | 21
-rw-r--r--  pkg/sentry/platform/interrupt/interrupt.go | 5
-rw-r--r--  pkg/sentry/platform/kvm/BUILD | 18
-rw-r--r--  pkg/sentry/platform/kvm/address_space.go | 82
-rw-r--r--  pkg/sentry/platform/kvm/bluepill_allocator.go (renamed from pkg/sentry/platform/kvm/allocator.go) | 52
-rw-r--r--  pkg/sentry/platform/kvm/bluepill_amd64.go | 12
-rw-r--r--  pkg/sentry/platform/kvm/bluepill_amd64.s | 7
-rw-r--r--  pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go | 61
-rw-r--r--  pkg/sentry/platform/kvm/bluepill_arm64.go | 32
-rw-r--r--  pkg/sentry/platform/kvm/bluepill_arm64.s | 12
-rw-r--r--  pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go | 36
-rw-r--r--  pkg/sentry/platform/kvm/bluepill_fault.go | 4
-rw-r--r--  pkg/sentry/platform/kvm/bluepill_unsafe.go | 76
-rw-r--r--  pkg/sentry/platform/kvm/context.go | 15
-rw-r--r--  pkg/sentry/platform/kvm/filters_amd64.go | 13
-rw-r--r--  pkg/sentry/platform/kvm/filters_arm64.go | 11
-rw-r--r--  pkg/sentry/platform/kvm/kvm.go | 13
-rw-r--r--  pkg/sentry/platform/kvm/kvm_amd64_test.go | 51
-rw-r--r--  pkg/sentry/platform/kvm/kvm_arm64.go | 15
-rw-r--r--  pkg/sentry/platform/kvm/kvm_arm64_test.go | 31
-rw-r--r--  pkg/sentry/platform/kvm/kvm_arm64_unsafe.go | 4
-rw-r--r--  pkg/sentry/platform/kvm/kvm_const.go | 27
-rw-r--r--  pkg/sentry/platform/kvm/kvm_const_arm64.go | 42
-rw-r--r--  pkg/sentry/platform/kvm/kvm_test.go | 53
-rw-r--r--  pkg/sentry/platform/kvm/machine.go | 173
-rw-r--r--  pkg/sentry/platform/kvm/machine_amd64.go | 184
-rw-r--r--  pkg/sentry/platform/kvm/machine_amd64_unsafe.go | 117
-rw-r--r--  pkg/sentry/platform/kvm/machine_arm64.go | 98
-rw-r--r--  pkg/sentry/platform/kvm/machine_arm64_unsafe.go | 113
-rw-r--r--  pkg/sentry/platform/kvm/machine_unsafe.go | 30
-rw-r--r--  pkg/sentry/platform/kvm/testutil/testutil_arm64.go | 9
-rw-r--r--  pkg/sentry/platform/kvm/testutil/testutil_arm64.s | 28
-rw-r--r--  pkg/sentry/platform/kvm/virtual_map.go | 2
-rw-r--r--  pkg/sentry/platform/platform.go | 164
-rw-r--r--  pkg/sentry/platform/ptrace/BUILD | 3
-rw-r--r--  pkg/sentry/platform/ptrace/filters.go | 9
-rw-r--r--  pkg/sentry/platform/ptrace/ptrace.go | 11
-rw-r--r--  pkg/sentry/platform/ptrace/subprocess.go | 14
-rw-r--r--  pkg/sentry/platform/ptrace/subprocess_amd64.go | 2
-rw-r--r--  pkg/sentry/platform/ptrace/subprocess_linux.go | 10
-rw-r--r--  pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go | 61
-rw-r--r--  pkg/sentry/platform/ptrace/subprocess_unsafe.go | 2
-rw-r--r--  pkg/sentry/platform/ring0/aarch64.go | 35
-rw-r--r--  pkg/sentry/platform/ring0/defs_amd64.go | 38
-rw-r--r--  pkg/sentry/platform/ring0/defs_arm64.go | 3
-rw-r--r--  pkg/sentry/platform/ring0/entry_amd64.go | 7
-rw-r--r--  pkg/sentry/platform/ring0/entry_amd64.s | 204
-rw-r--r--  pkg/sentry/platform/ring0/entry_arm64.s | 181
-rw-r--r--  pkg/sentry/platform/ring0/gen_offsets/BUILD | 5
-rw-r--r--  pkg/sentry/platform/ring0/kernel.go | 46
-rw-r--r--  pkg/sentry/platform/ring0/kernel_amd64.go | 76
-rw-r--r--  pkg/sentry/platform/ring0/kernel_arm64.go | 19
-rw-r--r--  pkg/sentry/platform/ring0/lib_amd64.go | 12
-rw-r--r--  pkg/sentry/platform/ring0/lib_amd64.s | 47
-rw-r--r--  pkg/sentry/platform/ring0/lib_arm64.go | 17
-rw-r--r--  pkg/sentry/platform/ring0/lib_arm64.s | 102
-rw-r--r--  pkg/sentry/platform/ring0/offsets_amd64.go | 11
-rw-r--r--  pkg/sentry/platform/ring0/offsets_arm64.go | 2
-rw-r--r--  pkg/sentry/platform/ring0/pagetables/allocator.go | 11
-rw-r--r--  pkg/sentry/platform/ring0/pagetables/pagetables.go | 8
-rw-r--r--  pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go | 11
-rw-r--r--  pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go | 2
-rw-r--r--  pkg/sentry/platform/ring0/x86.go | 40
-rw-r--r--  pkg/sentry/socket/BUILD | 1
-rw-r--r--  pkg/sentry/socket/control/control.go | 8
-rw-r--r--  pkg/sentry/socket/control/control_vfs2.go | 18
-rw-r--r--  pkg/sentry/socket/hostinet/BUILD | 6
-rw-r--r--  pkg/sentry/socket/hostinet/socket.go | 25
-rw-r--r--  pkg/sentry/socket/hostinet/socket_vfs2.go | 27
-rw-r--r--  pkg/sentry/socket/hostinet/stack.go | 58
-rw-r--r--  pkg/sentry/socket/netfilter/BUILD | 2
-rw-r--r--  pkg/sentry/socket/netfilter/extensions.go | 72
-rw-r--r--  pkg/sentry/socket/netfilter/ipv4.go | 265
-rw-r--r--  pkg/sentry/socket/netfilter/ipv6.go | 270
-rw-r--r--  pkg/sentry/socket/netfilter/netfilter.go | 663
-rw-r--r--  pkg/sentry/socket/netfilter/owner_matcher.go | 2
-rw-r--r--  pkg/sentry/socket/netfilter/targets.go | 373
-rw-r--r--  pkg/sentry/socket/netfilter/tcp_matcher.go | 70
-rw-r--r--  pkg/sentry/socket/netfilter/udp_matcher.go | 68
-rw-r--r--  pkg/sentry/socket/netlink/BUILD | 3
-rw-r--r--  pkg/sentry/socket/netlink/provider.go | 2
-rw-r--r--  pkg/sentry/socket/netlink/provider_vfs2.go | 3
-rw-r--r--  pkg/sentry/socket/netlink/route/protocol.go | 62
-rw-r--r--  pkg/sentry/socket/netlink/socket.go | 35
-rw-r--r--  pkg/sentry/socket/netlink/socket_vfs2.go | 31
-rw-r--r--  pkg/sentry/socket/netstack/BUILD | 4
-rw-r--r--  pkg/sentry/socket/netstack/netstack.go | 883
-rw-r--r--  pkg/sentry/socket/netstack/netstack_vfs2.go | 114
-rw-r--r--  pkg/sentry/socket/netstack/stack.go | 171
-rw-r--r--  pkg/sentry/socket/socket.go | 8
-rw-r--r--  pkg/sentry/socket/unix/BUILD | 29
-rw-r--r--  pkg/sentry/socket/unix/transport/BUILD | 13
-rw-r--r--  pkg/sentry/socket/unix/transport/connectioned.go | 37
-rw-r--r--  pkg/sentry/socket/unix/transport/connectionless.go | 18
-rw-r--r--  pkg/sentry/socket/unix/transport/queue.go | 24
-rw-r--r--  pkg/sentry/socket/unix/transport/unix.go | 109
-rw-r--r--  pkg/sentry/socket/unix/unix.go | 145
-rw-r--r--  pkg/sentry/socket/unix/unix_vfs2.go | 100
-rw-r--r--  pkg/sentry/state/BUILD | 2
-rw-r--r--  pkg/sentry/state/state.go | 17
-rw-r--r--  pkg/sentry/strace/BUILD | 1
-rw-r--r--  pkg/sentry/strace/epoll.go | 14
-rw-r--r--  pkg/sentry/strace/socket.go | 26
-rw-r--r--  pkg/sentry/strace/strace.go | 49
-rw-r--r--  pkg/sentry/syscalls/epoll.go | 18
-rw-r--r--  pkg/sentry/syscalls/linux/BUILD | 4
-rw-r--r--  pkg/sentry/syscalls/linux/error.go | 14
-rw-r--r--  pkg/sentry/syscalls/linux/linux64.go | 34
-rw-r--r--  pkg/sentry/syscalls/linux/sys_aio.go | 206
-rw-r--r--  pkg/sentry/syscalls/linux/sys_capability.go | 16
-rw-r--r--  pkg/sentry/syscalls/linux/sys_eventfd.go | 2
-rw-r--r--  pkg/sentry/syscalls/linux/sys_file.go | 204
-rw-r--r--  pkg/sentry/syscalls/linux/sys_futex.go | 66
-rw-r--r--  pkg/sentry/syscalls/linux/sys_getdents.go | 35
-rw-r--r--  pkg/sentry/syscalls/linux/sys_identity.go | 16
-rw-r--r--  pkg/sentry/syscalls/linux/sys_inotify.go | 10
-rw-r--r--  pkg/sentry/syscalls/linux/sys_lseek.go | 4
-rw-r--r--  pkg/sentry/syscalls/linux/sys_membarrier.go | 103
-rw-r--r--  pkg/sentry/syscalls/linux/sys_mmap.go | 17
-rw-r--r--  pkg/sentry/syscalls/linux/sys_mount.go | 2
-rw-r--r--  pkg/sentry/syscalls/linux/sys_pipe.go | 11
-rw-r--r--  pkg/sentry/syscalls/linux/sys_poll.go | 30
-rw-r--r--  pkg/sentry/syscalls/linux/sys_prctl.go | 9
-rw-r--r--  pkg/sentry/syscalls/linux/sys_read.go | 24
-rw-r--r--  pkg/sentry/syscalls/linux/sys_rlimit.go | 19
-rw-r--r--  pkg/sentry/syscalls/linux/sys_rusage.go | 4
-rw-r--r--  pkg/sentry/syscalls/linux/sys_sched.go | 8
-rw-r--r--  pkg/sentry/syscalls/linux/sys_seccomp.go | 8
-rw-r--r--  pkg/sentry/syscalls/linux/sys_sem.go | 31
-rw-r--r--  pkg/sentry/syscalls/linux/sys_shm.go | 21
-rw-r--r--  pkg/sentry/syscalls/linux/sys_signal.go | 8
-rw-r--r--  pkg/sentry/syscalls/linux/sys_socket.go | 144
-rw-r--r--  pkg/sentry/syscalls/linux/sys_splice.go | 29
-rw-r--r--  pkg/sentry/syscalls/linux/sys_stat.go | 12
-rw-r--r--  pkg/sentry/syscalls/linux/sys_sync.go | 14
-rw-r--r--  pkg/sentry/syscalls/linux/sys_sysinfo.go | 14
-rw-r--r--  pkg/sentry/syscalls/linux/sys_thread.go | 23
-rw-r--r--  pkg/sentry/syscalls/linux/sys_time.go | 11
-rw-r--r--  pkg/sentry/syscalls/linux/sys_timer.go | 101
-rw-r--r--  pkg/sentry/syscalls/linux/sys_timerfd.go | 12
-rw-r--r--  pkg/sentry/syscalls/linux/sys_tls_amd64.go | 13
-rw-r--r--  pkg/sentry/syscalls/linux/sys_utsname.go | 2
-rw-r--r--  pkg/sentry/syscalls/linux/sys_write.go | 22
-rw-r--r--  pkg/sentry/syscalls/linux/sys_xattr.go | 8
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/BUILD | 10
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/aio.go | 226
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/epoll.go | 63
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/eventfd.go | 4
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/execve.go | 13
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/fd.go | 199
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/filesystem.go | 38
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/fscontext.go | 22
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/getdents.go | 2
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/inotify.go | 23
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/ioctl.go | 75
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/lock.go | 64
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/memfd.go | 3
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/mmap.go | 16
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/mount.go | 150
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/path.go | 12
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/pipe.go | 11
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/poll.go | 42
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/read_write.go | 99
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/setstat.go | 76
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/signal.go | 4
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/socket.go | 144
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/splice.go | 387
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/stat.go | 30
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/sync.go | 48
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/timerfd.go | 14
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/vfs2.go | 134
-rw-r--r--  pkg/sentry/syscalls/linux/vfs2/xattr.go | 48
-rw-r--r--  pkg/sentry/time/muldiv_arm64.s | 3
-rw-r--r--  pkg/sentry/time/parameters.go | 12
-rw-r--r--  pkg/sentry/time/parameters_test.go | 15
-rw-r--r--  pkg/sentry/usage/memory.go | 2
-rw-r--r--  pkg/sentry/vfs/BUILD | 41
-rw-r--r--  pkg/sentry/vfs/README.md | 15
-rw-r--r--  pkg/sentry/vfs/anonfs.go | 41
-rw-r--r--  pkg/sentry/vfs/context.go | 24
-rw-r--r--  pkg/sentry/vfs/dentry.go | 66
-rw-r--r--  pkg/sentry/vfs/device.go | 3
-rw-r--r--  pkg/sentry/vfs/epoll.go | 38
-rw-r--r--  pkg/sentry/vfs/file_description.go | 326
-rw-r--r--  pkg/sentry/vfs/file_description_impl_util.go | 136
-rw-r--r--  pkg/sentry/vfs/file_description_impl_util_test.go | 27
-rw-r--r--  pkg/sentry/vfs/filesystem.go | 116
-rw-r--r--  pkg/sentry/vfs/filesystem_impl_util.go | 13
-rw-r--r--  pkg/sentry/vfs/filesystem_type.go | 12
-rw-r--r--  pkg/sentry/vfs/g3doc/inotify.md | 210
-rw-r--r--  pkg/sentry/vfs/genericfstree/genericfstree.go | 16
-rw-r--r--  pkg/sentry/vfs/inotify.go | 285
-rw-r--r--  pkg/sentry/vfs/lock.go (renamed from pkg/sentry/vfs/lock/lock.go) | 66
-rw-r--r--  pkg/sentry/vfs/lock/BUILD | 13
-rw-r--r--  pkg/sentry/vfs/memxattr/xattr.go | 16
-rw-r--r--  pkg/sentry/vfs/mount.go | 273
-rw-r--r--  pkg/sentry/vfs/mount_test.go | 26
-rw-r--r--  pkg/sentry/vfs/mount_unsafe.go | 50
-rw-r--r--  pkg/sentry/vfs/options.go | 73
-rw-r--r--  pkg/sentry/vfs/pathname.go | 18
-rw-r--r--  pkg/sentry/vfs/permissions.go | 95
-rw-r--r--  pkg/sentry/vfs/resolving_path.go | 48
-rw-r--r--  pkg/sentry/vfs/save_restore.go | 124
-rw-r--r--  pkg/sentry/vfs/vfs.go | 297
-rw-r--r--  pkg/sentry/watchdog/watchdog.go | 42
572 files changed, 30555 insertions, 9511 deletions
diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD
index 901e0f320..4af4d6e84 100644
--- a/pkg/sentry/arch/BUILD
+++ b/pkg/sentry/arch/BUILD
@@ -22,6 +22,7 @@ go_library(
"signal_info.go",
"signal_stack.go",
"stack.go",
+ "stack_unsafe.go",
"syscalls_amd64.go",
"syscalls_arm64.go",
],
@@ -33,11 +34,12 @@ go_library(
"//pkg/context",
"//pkg/cpuid",
"//pkg/log",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
"//pkg/sentry/limits",
"//pkg/sync",
"//pkg/syserror",
"//pkg/usermem",
- "//tools/go_marshal/marshal",
],
)
diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go
index a903d031c..d75d665ae 100644
--- a/pkg/sentry/arch/arch.go
+++ b/pkg/sentry/arch/arch.go
@@ -23,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/cpuid"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -72,12 +73,12 @@ type Context interface {
// with return values of varying sizes (for example ARCH_GETFS). This
// is a simple utility function to convert to the native size in these
// cases, and then we can CopyOut.
- Native(val uintptr) interface{}
+ Native(val uintptr) marshal.Marshallable
// Value converts a native type back to a generic value.
// Once a value has been converted to native via the above call -- it
// can be converted back here.
- Value(val interface{}) uintptr
+ Value(val marshal.Marshallable) uintptr
// Width returns the number of bytes for a native value.
Width() uint
@@ -205,7 +206,7 @@ type Context interface {
// equivalent of arch_ptrace():
// PtracePeekUser implements ptrace(PTRACE_PEEKUSR).
- PtracePeekUser(addr uintptr) (interface{}, error)
+ PtracePeekUser(addr uintptr) (marshal.Marshallable, error)
// PtracePokeUser implements ptrace(PTRACE_POKEUSR).
PtracePokeUser(addr, data uintptr) error
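
A minimal sketch of how the new marshal-based Native/Value API is used: the caller converts to the tracee's native width, then copies the result out through any marshal.CopyContext (copyOutNative is a hypothetical helper, not part of this change):

package example

import (
	"gvisor.dev/gvisor/pkg/marshal"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/usermem"
)

// copyOutNative converts val to the architecture's native width (e.g.
// *primitive.Uint64 on amd64) and writes it to user memory at addr.
func copyOutNative(cc marshal.CopyContext, c arch.Context, addr usermem.Addr, val uintptr) error {
	v := c.Native(val)
	if _, err := v.CopyOut(cc, addr); err != nil {
		return err
	}
	// c.Value(v) performs the inverse conversion and returns val.
	return nil
}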
diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go
index 343f81f59..fd73751e7 100644
--- a/pkg/sentry/arch/arch_aarch64.go
+++ b/pkg/sentry/arch/arch_aarch64.go
@@ -17,7 +17,6 @@
package arch
import (
- "encoding/binary"
"fmt"
"io"
@@ -29,7 +28,14 @@ import (
)
// Registers represents the CPU registers for this architecture.
-type Registers = linux.PtraceRegs
+//
+// +stateify savable
+type Registers struct {
+ linux.PtraceRegs
+
+ // TPIDR_EL0 is the EL0 Read/Write Software Thread ID Register.
+ TPIDR_EL0 uint64
+}
const (
// SyscallWidth is the width of instructions.
@@ -49,9 +55,14 @@ const ARMTrapFlag = uint64(1) << 21
type aarch64FPState []byte
// initAarch64FPState sets up initial state.
+//
+// Related code in Linux kernel: fpsimd_flush_thread().
+// FPCR = FPCR_RM_RN (0x0 << 22).
+//
+// Currently, aarch64FPState is only a 0x210-byte buffer for fpstate.
+// The FP header is not used in sentry/ptrace/kvm.
+//
func initAarch64FPState(data aarch64FPState) {
- binary.LittleEndian.PutUint32(data, fpsimdMagic)
- binary.LittleEndian.PutUint32(data[4:], fpsimdContextSize)
}
func newAarch64FPStateSlice() []byte {
@@ -90,6 +101,8 @@ func NewFloatingPointData() *FloatingPointData {
// State contains the common architecture bits for aarch64 (the build tag of this
// file ensures it's only built on aarch64).
+//
+// +stateify savable
type State struct {
// The system registers.
Regs Registers
@@ -97,9 +110,6 @@ type State struct {
// Our floating point state.
aarch64FPState `state:"wait"`
- // TLS pointer
- TPValue uint64
-
// FeatureSet is a pointer to the currently active feature set.
FeatureSet *cpuid.FeatureSet
@@ -144,6 +154,7 @@ func (s State) Proto() *rpb.Registers {
Sp: s.Regs.Sp,
Pc: s.Regs.Pc,
Pstate: s.Regs.Pstate,
+ Tls: s.Regs.TPIDR_EL0,
}
return &rpb.Registers{Arch: &rpb.Registers_Arm64{Arm64: regs}}
}
@@ -153,7 +164,6 @@ func (s *State) Fork() State {
return State{
Regs: s.Regs,
aarch64FPState: s.aarch64FPState.fork(),
- TPValue: s.TPValue,
FeatureSet: s.FeatureSet,
OrigR0: s.OrigR0,
}
@@ -223,6 +233,7 @@ func (s *State) RegisterMap() (map[string]uintptr, error) {
"Sp": uintptr(s.Regs.Sp),
"Pc": uintptr(s.Regs.Pc),
"Pstate": uintptr(s.Regs.Pstate),
+ "Tls": uintptr(s.Regs.TPIDR_EL0),
}, nil
}
@@ -237,18 +248,18 @@ func (s *State) ptraceGetRegs() Registers {
return s.Regs
}
-var registersSize = (*Registers)(nil).SizeBytes()
+var ptraceRegistersSize = (*linux.PtraceRegs)(nil).SizeBytes()
// PtraceSetRegs implements Context.PtraceSetRegs.
func (s *State) PtraceSetRegs(src io.Reader) (int, error) {
var regs Registers
- buf := make([]byte, registersSize)
+ buf := make([]byte, ptraceRegistersSize)
if _, err := io.ReadFull(src, buf); err != nil {
return 0, err
}
regs.UnmarshalUnsafe(buf)
s.Regs = regs
- return registersSize, nil
+ return ptraceRegistersSize, nil
}
// PtraceGetFPRegs implements Context.PtraceGetFPRegs.
@@ -274,7 +285,7 @@ const (
func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) {
switch regset {
case _NT_PRSTATUS:
- if maxlen < registersSize {
+ if maxlen < ptraceRegistersSize {
return 0, syserror.EFAULT
}
return s.PtraceGetRegs(dst)
@@ -287,7 +298,7 @@ func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int,
func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) {
switch regset {
case _NT_PRSTATUS:
- if maxlen < registersSize {
+ if maxlen < ptraceRegistersSize {
return 0, syserror.EFAULT
}
return s.PtraceSetRegs(src)
diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go
index 3b3a0a272..c7d3a206d 100644
--- a/pkg/sentry/arch/arch_amd64.go
+++ b/pkg/sentry/arch/arch_amd64.go
@@ -23,6 +23,8 @@ import (
"syscall"
"gvisor.dev/gvisor/pkg/cpuid"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -179,14 +181,14 @@ func (c *context64) SetOldRSeqInterruptedIP(value uintptr) {
}
// Native returns the native type for the given val.
-func (c *context64) Native(val uintptr) interface{} {
- v := uint64(val)
+func (c *context64) Native(val uintptr) marshal.Marshallable {
+ v := primitive.Uint64(val)
return &v
}
// Value returns the generic val for the given native type.
-func (c *context64) Value(val interface{}) uintptr {
- return uintptr(*val.(*uint64))
+func (c *context64) Value(val marshal.Marshallable) uintptr {
+ return uintptr(*val.(*primitive.Uint64))
}
// Width returns the byte width of this architecture.
@@ -293,14 +295,14 @@ func (c *context64) PIELoadAddress(l MmapLayout) usermem.Addr {
const userStructSize = 928
// PtracePeekUser implements Context.PtracePeekUser.
-func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) {
+func (c *context64) PtracePeekUser(addr uintptr) (marshal.Marshallable, error) {
if addr&7 != 0 || addr >= userStructSize {
return nil, syscall.EIO
}
// PTRACE_PEEKUSER and PTRACE_POKEUSER are only effective on regs and
// u_debugreg, returning 0 or silently no-oping for other fields
// respectively.
- if addr < uintptr(registersSize) {
+ if addr < uintptr(ptraceRegistersSize) {
regs := c.ptraceGetRegs()
buf := make([]byte, regs.SizeBytes())
regs.MarshalUnsafe(buf)
@@ -315,7 +317,7 @@ func (c *context64) PtracePokeUser(addr, data uintptr) error {
if addr&7 != 0 || addr >= userStructSize {
return syscall.EIO
}
- if addr < uintptr(registersSize) {
+ if addr < uintptr(ptraceRegistersSize) {
regs := c.ptraceGetRegs()
buf := make([]byte, regs.SizeBytes())
regs.MarshalUnsafe(buf)
diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go
index ada7ac7b8..680d23a9f 100644
--- a/pkg/sentry/arch/arch_arm64.go
+++ b/pkg/sentry/arch/arch_arm64.go
@@ -22,6 +22,8 @@ import (
"syscall"
"gvisor.dev/gvisor/pkg/cpuid"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -73,6 +75,8 @@ const (
)
// context64 represents an ARM64 context.
+//
+// +stateify savable
type context64 struct {
State
sigFPState []aarch64FPState // fpstate to be restored on sigreturn.
@@ -142,7 +146,7 @@ func (c *context64) SetStack(value uintptr) {
// TLS returns the current TLS pointer.
func (c *context64) TLS() uintptr {
- return uintptr(c.TPValue)
+ return uintptr(c.Regs.TPIDR_EL0)
}
// SetTLS sets the current TLS pointer. Returns false if value is invalid.
@@ -151,7 +155,7 @@ func (c *context64) SetTLS(value uintptr) bool {
return false
}
- c.TPValue = uint64(value)
+ c.Regs.TPIDR_EL0 = uint64(value)
return true
}
@@ -161,14 +165,14 @@ func (c *context64) SetOldRSeqInterruptedIP(value uintptr) {
}
// Native returns the native type for the given val.
-func (c *context64) Native(val uintptr) interface{} {
- v := uint64(val)
+func (c *context64) Native(val uintptr) marshal.Marshallable {
+ v := primitive.Uint64(val)
return &v
}
// Value returns the generic val for the given native type.
-func (c *context64) Value(val interface{}) uintptr {
- return uintptr(*val.(*uint64))
+func (c *context64) Value(val marshal.Marshallable) uintptr {
+ return uintptr(*val.(*primitive.Uint64))
}
// Width returns the byte width of this architecture.
@@ -272,7 +276,7 @@ func (c *context64) PIELoadAddress(l MmapLayout) usermem.Addr {
}
// PtracePeekUser implements Context.PtracePeekUser.
-func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) {
+func (c *context64) PtracePeekUser(addr uintptr) (marshal.Marshallable, error) {
// TODO(gvisor.dev/issue/1239): Full ptrace supporting for Arm64.
return c.Native(0), nil
}
diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go
index dc458b37f..b9405b320 100644
--- a/pkg/sentry/arch/arch_x86.go
+++ b/pkg/sentry/arch/arch_x86.go
@@ -31,7 +31,11 @@ import (
)
// Registers represents the CPU registers for this architecture.
-type Registers = linux.PtraceRegs
+//
+// +stateify savable
+type Registers struct {
+ linux.PtraceRegs
+}
// System-related constants for x86.
const (
@@ -311,12 +315,12 @@ func (s *State) ptraceGetRegs() Registers {
return regs
}
-var registersSize = (*Registers)(nil).SizeBytes()
+var ptraceRegistersSize = (*linux.PtraceRegs)(nil).SizeBytes()
// PtraceSetRegs implements Context.PtraceSetRegs.
func (s *State) PtraceSetRegs(src io.Reader) (int, error) {
var regs Registers
- buf := make([]byte, registersSize)
+ buf := make([]byte, ptraceRegistersSize)
if _, err := io.ReadFull(src, buf); err != nil {
return 0, err
}
@@ -374,7 +378,7 @@ func (s *State) PtraceSetRegs(src io.Reader) (int, error) {
}
regs.Eflags = (s.Regs.Eflags &^ eflagsPtraceMutable) | (regs.Eflags & eflagsPtraceMutable)
s.Regs = regs
- return registersSize, nil
+ return ptraceRegistersSize, nil
}
// isUserSegmentSelector returns true if the given segment selector specifies a
@@ -543,7 +547,7 @@ const (
func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) {
switch regset {
case _NT_PRSTATUS:
- if maxlen < registersSize {
+ if maxlen < ptraceRegistersSize {
return 0, syserror.EFAULT
}
return s.PtraceGetRegs(dst)
@@ -563,7 +567,7 @@ func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int,
func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) {
switch regset {
case _NT_PRSTATUS:
- if maxlen < registersSize {
+ if maxlen < ptraceRegistersSize {
return 0, syserror.EFAULT
}
return s.PtraceSetRegs(src)
diff --git a/pkg/sentry/arch/registers.proto b/pkg/sentry/arch/registers.proto
index 60c027aab..2727ba08a 100644
--- a/pkg/sentry/arch/registers.proto
+++ b/pkg/sentry/arch/registers.proto
@@ -83,6 +83,7 @@ message ARM64Registers {
uint64 sp = 32;
uint64 pc = 33;
uint64 pstate = 34;
+ uint64 tls = 35;
}
message Registers {
oneof arch {
diff --git a/pkg/sentry/arch/signal_act.go b/pkg/sentry/arch/signal_act.go
index 32173aa20..d3e2324a8 100644
--- a/pkg/sentry/arch/signal_act.go
+++ b/pkg/sentry/arch/signal_act.go
@@ -14,7 +14,7 @@
package arch
-import "gvisor.dev/gvisor/tools/go_marshal/marshal"
+import "gvisor.dev/gvisor/pkg/marshal"
// Special values for SignalAct.Handler.
const (
diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go
index 6fb756f0e..72e07a988 100644
--- a/pkg/sentry/arch/signal_amd64.go
+++ b/pkg/sentry/arch/signal_amd64.go
@@ -17,17 +17,19 @@
package arch
import (
- "encoding/binary"
"math"
"syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/usermem"
)
// SignalContext64 is equivalent to struct sigcontext, the type passed as the
// second argument to signal handlers set by signal(2).
+//
+// +marshal
type SignalContext64 struct {
R8 uint64
R9 uint64
@@ -68,6 +70,8 @@ const (
)
// UContext64 is equivalent to ucontext_t on 64-bit x86.
+//
+// +marshal
type UContext64 struct {
Flags uint64
Link uint64
@@ -172,12 +176,7 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt
// "... the value (%rsp+8) is always a multiple of 16 (...) when
// control is transferred to the function entry point." - AMD64 ABI
- ucSize := binary.Size(uc)
- if ucSize < 0 {
- // This can only happen if we've screwed up the definition of
- // UContext64.
- panic("can't get size of UContext64")
- }
+ ucSize := uc.SizeBytes()
// st.Arch.Width() is for the restorer address. sizeof(siginfo) == 128.
frameSize := int(st.Arch.Width()) + ucSize + 128
frameBottom := (sp-usermem.Addr(frameSize)) & ^usermem.Addr(15) - 8
@@ -195,18 +194,18 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt
info.FixSignalCodeForUser()
// Set up the stack frame.
- infoAddr, err := st.Push(info)
- if err != nil {
+ if _, err := info.CopyOut(st, StackBottomMagic); err != nil {
return err
}
- ucAddr, err := st.Push(uc)
- if err != nil {
+ infoAddr := st.Bottom
+ if _, err := uc.CopyOut(st, StackBottomMagic); err != nil {
return err
}
+ ucAddr := st.Bottom
if act.HasRestorer() {
// Push the restorer return address.
// Note that this doesn't need to be popped.
- if _, err := st.Push(usermem.Addr(act.Restorer)); err != nil {
+ if _, err := primitive.CopyUint64Out(st, StackBottomMagic, act.Restorer); err != nil {
return err
}
} else {
@@ -240,11 +239,11 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt
func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalStack, error) {
// Copy out the stack frame.
var uc UContext64
- if _, err := st.Pop(&uc); err != nil {
+ if _, err := uc.CopyIn(st, StackBottomMagic); err != nil {
return 0, SignalStack{}, err
}
var info SignalInfo
- if _, err := st.Pop(&info); err != nil {
+ if _, err := info.CopyIn(st, StackBottomMagic); err != nil {
return 0, SignalStack{}, err
}
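
The frameBottom computation above is what enforces the quoted ABI rule; a worked example with hypothetical values:

// With sp = 0x7fffffffe123 and frameSize = 0x4d8 (both hypothetical):
//   frameBottom = (sp - frameSize) &^ 0xf - 8
//               = 0x7fffffffdc4b &^ 0xf - 8
//               = 0x7fffffffdc40 - 8
//               = 0x7fffffffdc38
// frameBottom%16 == 8, so frameBottom+8 is a multiple of 16, which is the
// "(%rsp+8) is always a multiple of 16" requirement quoted above.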
diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go
index 642c79dda..7fde5d34e 100644
--- a/pkg/sentry/arch/signal_arm64.go
+++ b/pkg/sentry/arch/signal_arm64.go
@@ -12,10 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+// +build arm64
+
package arch
import (
- "encoding/binary"
"syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -25,6 +26,8 @@ import (
// SignalContext64 is equivalent to struct sigcontext, the type passed as the
// second argument to signal handlers set by signal(2).
+//
+// +marshal
type SignalContext64 struct {
FaultAddr uint64
Regs [31]uint64
@@ -36,6 +39,7 @@ type SignalContext64 struct {
Reserved [3568]uint8
}
+// +marshal
type aarch64Ctx struct {
Magic uint32
Size uint32
@@ -43,6 +47,8 @@ type aarch64Ctx struct {
// FpsimdContext is equivalent to struct fpsimd_context on arm64
// (arch/arm64/include/uapi/asm/sigcontext.h).
+//
+// +marshal
type FpsimdContext struct {
Head aarch64Ctx
Fpsr uint32
@@ -51,13 +57,15 @@ type FpsimdContext struct {
}
// UContext64 is equivalent to ucontext on arm64(arch/arm64/include/uapi/asm/ucontext.h).
+//
+// +marshal
type UContext64 struct {
Flags uint64
Link uint64
Stack SignalStack
Sigset linux.SignalSet
// glibc uses a 1024-bit sigset_t
- _pad [(1024 - 64) / 8]byte
+ _pad [120]byte // (1024 - 64) / 8 = 120
// sigcontext must be aligned to 16-byte
_pad2 [8]byte
// last for future expansion
@@ -94,11 +102,7 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt
},
Sigset: sigset,
}
-
- ucSize := binary.Size(uc)
- if ucSize < 0 {
- panic("can't get size of UContext64")
- }
+ ucSize := uc.SizeBytes()
// frameSize = ucSize + sizeof(siginfo).
// sizeof(siginfo) == 128.
@@ -119,14 +123,14 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt
info.FixSignalCodeForUser()
// Set up the stack frame.
- infoAddr, err := st.Push(info)
- if err != nil {
+ if _, err := info.CopyOut(st, StackBottomMagic); err != nil {
return err
}
- ucAddr, err := st.Push(uc)
- if err != nil {
+ infoAddr := st.Bottom
+ if _, err := uc.CopyOut(st, StackBottomMagic); err != nil {
return err
}
+ ucAddr := st.Bottom
// Set up registers.
c.Regs.Sp = uint64(st.Bottom)
@@ -147,11 +151,11 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt
func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalStack, error) {
// Copy out the stack frame.
var uc UContext64
- if _, err := st.Pop(&uc); err != nil {
+ if _, err := uc.CopyIn(st, StackBottomMagic); err != nil {
return 0, SignalStack{}, err
}
var info SignalInfo
- if _, err := st.Pop(&info); err != nil {
+ if _, err := info.CopyIn(st, StackBottomMagic); err != nil {
return 0, SignalStack{}, err
}
diff --git a/pkg/sentry/arch/signal_stack.go b/pkg/sentry/arch/signal_stack.go
index 0fa738a1d..a1eae98f9 100644
--- a/pkg/sentry/arch/signal_stack.go
+++ b/pkg/sentry/arch/signal_stack.go
@@ -17,8 +17,8 @@
package arch
import (
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/usermem"
- "gvisor.dev/gvisor/tools/go_marshal/marshal"
)
const (
diff --git a/pkg/sentry/arch/stack.go b/pkg/sentry/arch/stack.go
index 1108fa0bd..5f06c751d 100644
--- a/pkg/sentry/arch/stack.go
+++ b/pkg/sentry/arch/stack.go
@@ -15,14 +15,16 @@
package arch
import (
- "encoding/binary"
- "fmt"
-
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/usermem"
)
-// Stack is a simple wrapper around a usermem.IO and an address.
+// Stack is a simple wrapper around a usermem.IO and an address. Stack
+// implements marshal.CopyContext, and marshallable values can be pushed or
+// popped from the stack through the marshal.Marshallable interface.
+//
+// Stack is not thread-safe.
type Stack struct {
// Our arch info.
// We use this for automatic Native conversion of usermem.Addrs during
@@ -34,105 +36,60 @@ type Stack struct {
// Our current stack bottom.
Bottom usermem.Addr
-}
-// Push pushes the given values on to the stack.
-//
-// (This method supports Addrs and treats them as native types.)
-func (s *Stack) Push(vals ...interface{}) (usermem.Addr, error) {
- for _, v := range vals {
-
- // We convert some types to well-known serializable quanities.
- var norm interface{}
-
- // For array types, we will automatically add an appropriate
- // terminal value. This is done simply to make the interface
- // easier to use.
- var term interface{}
-
- switch v.(type) {
- case string:
- norm = []byte(v.(string))
- term = byte(0)
- case []int8, []uint8:
- norm = v
- term = byte(0)
- case []int16, []uint16:
- norm = v
- term = uint16(0)
- case []int32, []uint32:
- norm = v
- term = uint32(0)
- case []int64, []uint64:
- norm = v
- term = uint64(0)
- case []usermem.Addr:
- // Special case: simply push recursively.
- _, err := s.Push(s.Arch.Native(uintptr(0)))
- if err != nil {
- return 0, err
- }
- varr := v.([]usermem.Addr)
- for i := len(varr) - 1; i >= 0; i-- {
- _, err := s.Push(varr[i])
- if err != nil {
- return 0, err
- }
- }
- continue
- case usermem.Addr:
- norm = s.Arch.Native(uintptr(v.(usermem.Addr)))
- default:
- norm = v
- }
+ // Scratch buffer used for marshalling to avoid having to repeatedly
+ // allocate scratch memory.
+ scratchBuf []byte
+}
- if term != nil {
- _, err := s.Push(term)
- if err != nil {
- return 0, err
- }
- }
+// scratchBufLen is the default length of Stack.scratchBuf. The
+// largest structs the stack regularly serializes are arch.SignalInfo
+// and arch.UContext64. We'll set the default size as the larger of
+// the two, arch.UContext64.
+var scratchBufLen = (*UContext64)(nil).SizeBytes()
- c := binary.Size(norm)
- if c < 0 {
- return 0, fmt.Errorf("bad binary.Size for %T", v)
- }
- n, err := usermem.CopyObjectOut(context.Background(), s.IO, s.Bottom-usermem.Addr(c), norm, usermem.IOOpts{})
- if err != nil || c != n {
- return 0, err
- }
+// CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer.
+func (s *Stack) CopyScratchBuffer(size int) []byte {
+ if len(s.scratchBuf) < size {
+ s.scratchBuf = make([]byte, size)
+ }
+ return s.scratchBuf[:size]
+}
+// StackBottomMagic is the special address callers must pass to all stack
+// marshalling operations to cause the src/dst address to be computed based on
+// the current end of the stack.
+const StackBottomMagic = ^usermem.Addr(0) // usermem.Addr(-1)
+
+// CopyOutBytes implements marshal.CopyContext.CopyOutBytes. CopyOutBytes
+// computes an appropriate address based on the current end of the
+// stack. Callers must pass the sentinel address StackBottomMagic to marshal
+// methods to indicate this.
+func (s *Stack) CopyOutBytes(sentinel usermem.Addr, b []byte) (int, error) {
+ if sentinel != StackBottomMagic {
+ panic("Attempted to copy out to stack with absolute address")
+ }
+ c := len(b)
+ n, err := s.IO.CopyOut(context.Background(), s.Bottom-usermem.Addr(c), b, usermem.IOOpts{})
+ if err == nil && n == c {
s.Bottom -= usermem.Addr(n)
}
-
- return s.Bottom, nil
+ return n, err
}
-// Pop pops the given values off the stack.
-//
-// (This method supports Addrs and treats them as native types.)
-func (s *Stack) Pop(vals ...interface{}) (usermem.Addr, error) {
- for _, v := range vals {
-
- vaddr, isVaddr := v.(*usermem.Addr)
-
- var n int
- var err error
- if isVaddr {
- value := s.Arch.Native(uintptr(0))
- n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, value, usermem.IOOpts{})
- *vaddr = usermem.Addr(s.Arch.Value(value))
- } else {
- n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, v, usermem.IOOpts{})
- }
- if err != nil {
- return 0, err
- }
-
+// CopyInBytes implements marshal.CopyContext.CopyInBytes. CopyInBytes computes
+// an appropriate address based on the current end of the stack. Callers must
+// use the sentinel address StackBottomMagic to marshal methods to indicate
+// this.
+func (s *Stack) CopyInBytes(sentinel usermem.Addr, b []byte) (int, error) {
+ if sentinel != StackBottomMagic {
+ panic("Attempted to copy in from stack with absolute address")
+ }
+ n, err := s.IO.CopyIn(context.Background(), s.Bottom, b, usermem.IOOpts{})
+ if err == nil {
s.Bottom += usermem.Addr(n)
}
-
- return s.Bottom, nil
+ return n, err
}
// Align aligns the stack to the given offset.
@@ -142,6 +99,22 @@ func (s *Stack) Align(offset int) {
}
}
+// PushNullTerminatedByteSlice writes bs to the stack, followed by an extra null
+// byte at the end. On error, the contents of the stack and the bottom cursor
+// are undefined.
+func (s *Stack) PushNullTerminatedByteSlice(bs []byte) (int, error) {
+ // Note: the stack grows downward, so write the terminal null byte first.
+ nNull, err := primitive.CopyUint8Out(s, StackBottomMagic, 0)
+ if err != nil {
+ return 0, err
+ }
+ n, err := primitive.CopyByteSliceOut(s, StackBottomMagic, bs)
+ if err != nil {
+ return 0, err
+ }
+ return n + nNull, nil
+}
+
// StackLayout describes the location of the arguments and environment on the
// stack.
type StackLayout struct {
@@ -177,11 +150,10 @@ func (s *Stack) Load(args []string, env []string, aux Auxv) (StackLayout, error)
l.EnvvEnd = s.Bottom
envAddrs := make([]usermem.Addr, len(env))
for i := len(env) - 1; i >= 0; i-- {
- addr, err := s.Push(env[i])
- if err != nil {
+ if _, err := s.PushNullTerminatedByteSlice([]byte(env[i])); err != nil {
return StackLayout{}, err
}
- envAddrs[i] = addr
+ envAddrs[i] = s.Bottom
}
l.EnvvStart = s.Bottom
@@ -189,11 +161,10 @@ func (s *Stack) Load(args []string, env []string, aux Auxv) (StackLayout, error)
l.ArgvEnd = s.Bottom
argAddrs := make([]usermem.Addr, len(args))
for i := len(args) - 1; i >= 0; i-- {
- addr, err := s.Push(args[i])
- if err != nil {
+ if _, err := s.PushNullTerminatedByteSlice([]byte(args[i])); err != nil {
return StackLayout{}, err
}
- argAddrs[i] = addr
+ argAddrs[i] = s.Bottom
}
l.ArgvStart = s.Bottom
@@ -222,26 +193,26 @@ func (s *Stack) Load(args []string, env []string, aux Auxv) (StackLayout, error)
auxv = append(auxv, usermem.Addr(a.Key), a.Value)
}
auxv = append(auxv, usermem.Addr(0))
- _, err := s.Push(auxv)
+ _, err := s.pushAddrSliceAndTerminator(auxv)
if err != nil {
return StackLayout{}, err
}
// Push environment.
- _, err = s.Push(envAddrs)
+ _, err = s.pushAddrSliceAndTerminator(envAddrs)
if err != nil {
return StackLayout{}, err
}
// Push args.
- _, err = s.Push(argAddrs)
+ _, err = s.pushAddrSliceAndTerminator(argAddrs)
if err != nil {
return StackLayout{}, err
}
// Push arg count.
- _, err = s.Push(usermem.Addr(len(args)))
- if err != nil {
+ lenP := s.Arch.Native(uintptr(len(args)))
+ if _, err = lenP.CopyOut(s, StackBottomMagic); err != nil {
return StackLayout{}, err
}
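
Taken together, these changes replace the reflection-based Push/Pop with go-marshal copies at a sentinel address. A minimal usage sketch (the helper below is hypothetical, but the same pattern appears in the signal-frame code above):

package example

import (
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/usermem"
)

// pushThenPop pushes a SignalInfo onto the stack, records its address,
// and pops it back. CopyOut at StackBottomMagic writes just below
// st.Bottom and moves Bottom down; CopyIn reads at st.Bottom and moves
// Bottom back up.
func pushThenPop(st *arch.Stack, info *arch.SignalInfo) (usermem.Addr, error) {
	if _, err := info.CopyOut(st, arch.StackBottomMagic); err != nil {
		return 0, err
	}
	infoAddr := st.Bottom // address of the value just pushed

	var check arch.SignalInfo
	if _, err := check.CopyIn(st, arch.StackBottomMagic); err != nil {
		return 0, err
	}
	return infoAddr, nil
}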
diff --git a/pkg/sentry/arch/stack_unsafe.go b/pkg/sentry/arch/stack_unsafe.go
new file mode 100644
index 000000000..a90d297ee
--- /dev/null
+++ b/pkg/sentry/arch/stack_unsafe.go
@@ -0,0 +1,69 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arch
+
+import (
+ "reflect"
+ "runtime"
+ "unsafe"
+
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// pushAddrSliceAndTerminator copies a slice of addresses to the stack, and
+// also pushes an extra null address element at the end of the slice.
+//
+// Internally, we unsafely transmute the slice type from the arch-dependent
+// []usermem.Addr type, to a slice of fixed-sized ints so that we can pass it to
+// go-marshal.
+//
+// On error, the contents of the stack and the bottom cursor are undefined.
+func (s *Stack) pushAddrSliceAndTerminator(src []usermem.Addr) (int, error) {
+ // Note: the stack grows downward, so push the terminator first.
+ srcHdr := (*reflect.SliceHeader)(unsafe.Pointer(&src))
+ switch s.Arch.Width() {
+ case 8:
+ nNull, err := primitive.CopyUint64Out(s, StackBottomMagic, 0)
+ if err != nil {
+ return 0, err
+ }
+ var dst []uint64
+ dstHdr := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
+ dstHdr.Data = srcHdr.Data
+ dstHdr.Len = srcHdr.Len
+ dstHdr.Cap = srcHdr.Cap
+ n, err := primitive.CopyUint64SliceOut(s, StackBottomMagic, dst)
+ // Ensures src doesn't get GCed until we're done using it through dst.
+ runtime.KeepAlive(src)
+ return n + nNull, err
+ case 4:
+ nNull, err := primitive.CopyUint32Out(s, StackBottomMagic, 0)
+ if err != nil {
+ return 0, err
+ }
+ var dst []uint32
+ dstHdr := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
+ dstHdr.Data = srcHdr.Data
+ dstHdr.Len = srcHdr.Len
+ dstHdr.Cap = srcHdr.Cap
+ n, err := primitive.CopyUint32SliceOut(s, StackBottomMagic, dst)
+ // Ensure src doesn't get GCed until we're done using it through dst.
+ runtime.KeepAlive(src)
+ return n + nNull, err
+ default:
+ panic("Unsupported arch width")
+ }
+}
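
A note on the transmute above: after the SliceHeader copy, dst aliases src's backing array only through a bare integer Data field, which the garbage collector cannot see, so the runtime.KeepAlive(src) calls are what keep the array live while go-marshal reads through dst. The trick in miniature (an illustrative, hypothetical helper; assumes a 64-bit platform where usermem.Addr and uint64 have the same size):

package example

import (
	"reflect"
	"unsafe"

	"gvisor.dev/gvisor/pkg/usermem"
)

// asUint64s reinterprets a []usermem.Addr as a []uint64 without copying.
// The caller must keep src reachable (e.g. via runtime.KeepAlive) for as
// long as the returned slice is in use.
func asUint64s(src []usermem.Addr) []uint64 {
	var dst []uint64
	*(*reflect.SliceHeader)(unsafe.Pointer(&dst)) = *(*reflect.SliceHeader)(unsafe.Pointer(&src))
	return dst
}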
diff --git a/pkg/sentry/contexttest/contexttest.go b/pkg/sentry/contexttest/contexttest.go
index 8e5658c7a..dfd195a23 100644
--- a/pkg/sentry/contexttest/contexttest.go
+++ b/pkg/sentry/contexttest/contexttest.go
@@ -144,27 +144,7 @@ func (t *TestContext) MemoryFile() *pgalloc.MemoryFile {
// RootContext returns a Context that may be used in tests that need root
// credentials. Uses ptrace as the platform.Platform.
func RootContext(tb testing.TB) context.Context {
- return WithCreds(Context(tb), auth.NewRootCredentials(auth.NewRootUserNamespace()))
-}
-
-// WithCreds returns a copy of ctx carrying creds.
-func WithCreds(ctx context.Context, creds *auth.Credentials) context.Context {
- return &authContext{ctx, creds}
-}
-
-type authContext struct {
- context.Context
- creds *auth.Credentials
-}
-
-// Value implements context.Context.
-func (ac *authContext) Value(key interface{}) interface{} {
- switch key {
- case auth.CtxCredentials:
- return ac.creds
- default:
- return ac.Context.Value(key)
- }
+ return auth.ContextWithCredentials(Context(tb), auth.NewRootCredentials(auth.NewRootUserNamespace()))
}
// WithLimitSet returns a copy of ctx carrying l.
diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD
index 0c9a62f0d..deaf5fa23 100644
--- a/pkg/sentry/control/BUILD
+++ b/pkg/sentry/control/BUILD
@@ -16,15 +16,12 @@ go_library(
],
deps = [
"//pkg/abi/linux",
- "//pkg/context",
"//pkg/fd",
- "//pkg/fspath",
"//pkg/log",
"//pkg/sentry/fdimport",
"//pkg/sentry/fs",
"//pkg/sentry/fs/host",
"//pkg/sentry/fs/user",
- "//pkg/sentry/fsbridge",
"//pkg/sentry/fsimpl/host",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
@@ -36,10 +33,8 @@ go_library(
"//pkg/sentry/vfs",
"//pkg/sentry/watchdog",
"//pkg/sync",
- "//pkg/syserror",
"//pkg/tcpip/link/sniffer",
"//pkg/urpc",
- "@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/pkg/sentry/control/logging.go b/pkg/sentry/control/logging.go
index 811f24324..8a500a515 100644
--- a/pkg/sentry/control/logging.go
+++ b/pkg/sentry/control/logging.go
@@ -70,8 +70,8 @@ type LoggingArgs struct {
type Logging struct{}
// Change will change the log level and strace arguments. Although
-// this functions signature requires an error it never acctually
-// return san error. It's required by the URPC interface.
+// this function's signature requires an error, it never actually
+// returns an error. It's required by the URPC interface.
// Additionally, it may look odd that this is the only method
// attached to an empty struct but this is also part of how
// URPC dispatches.
diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go
index 663e51989..2bf3c45e1 100644
--- a/pkg/sentry/control/pprof.go
+++ b/pkg/sentry/control/pprof.go
@@ -49,6 +49,9 @@ type ProfileOpts struct {
// - dump out the stack trace of current goroutines.
// sentryctl -pid <pid> pprof-goroutine
type Profile struct {
+ // Kernel is the kernel under profile. It's immutable.
+ Kernel *kernel.Kernel
+
// mu protects the fields below.
mu sync.Mutex
@@ -57,9 +60,6 @@ type Profile struct {
// traceFile is the current execution trace output file.
traceFile *fd.FD
-
- // Kernel is the kernel under profile.
- Kernel *kernel.Kernel
}
// StartCPUProfile is an RPC stub which starts recording the CPU profile in a
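
Moving Kernel above mu is not cosmetic: a "mu protects the fields below" comment is a locking contract, and an immutable field listed below the mutex falsely implies it needs the lock. A minimal sketch of the corrected layout, with hypothetical field names:

package main

import (
	"fmt"
	"sync"
)

type profiler struct {
	// name is immutable after construction and needs no locking.
	name string

	// mu protects the fields below.
	mu sync.Mutex

	// samples is guarded by mu.
	samples int
}

func (p *profiler) record() {
	p.mu.Lock()
	defer p.mu.Unlock()
	p.samples++
}

func main() {
	p := &profiler{name: "sentry"}
	p.record()
	fmt.Println(p.name, p.samples)
}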
diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index 8767430b7..1d88db12f 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -23,15 +23,12 @@ import (
"text/tabwriter"
"time"
- "golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/sentry/fdimport"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/host"
"gvisor.dev/gvisor/pkg/sentry/fs/user"
- "gvisor.dev/gvisor/pkg/sentry/fsbridge"
hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -39,7 +36,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/urpc"
)
@@ -107,6 +103,9 @@ type ExecArgs struct {
// String prints the arguments as a string.
func (args ExecArgs) String() string {
+ if len(args.Argv) == 0 {
+ return args.Filename
+ }
a := make([]string, len(args.Argv))
copy(a, args.Argv)
if args.Filename != "" {
@@ -140,7 +139,6 @@ func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID
func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
// Import file descriptors.
fdTable := proc.Kernel.NewFDTable()
- defer fdTable.DecRef()
creds := auth.NewUserCredentials(
args.KUID,
@@ -178,60 +176,44 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
initArgs.MountNamespaceVFS2.IncRef()
}
ctx := initArgs.NewContext(proc.Kernel)
+ defer fdTable.DecRef(ctx)
+
+ if kernel.VFS2Enabled {
+ // Get the full path to the filename from the PATH env variable.
+ if initArgs.MountNamespaceVFS2 == nil {
+ // Set initArgs so that 'ctx' returns the namespace.
+ //
+ // Add a reference to the namespace, which is transferred to the new process.
+ initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2()
+ initArgs.MountNamespaceVFS2.IncRef()
+ }
+ } else {
+ if initArgs.MountNamespace == nil {
+ // Set initArgs so that 'ctx' returns the namespace.
+ initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace()
- if initArgs.Filename == "" {
- if kernel.VFS2Enabled {
- // Get the full path to the filename from the PATH env variable.
- if initArgs.MountNamespaceVFS2 == nil {
- // Set initArgs so that 'ctx' returns the namespace.
- //
- // MountNamespaceVFS2 adds a reference to the namespace, which is
- // transferred to the new process.
- initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2()
- }
- file, err := getExecutableFD(ctx, creds, proc.Kernel.VFS(), initArgs.MountNamespaceVFS2, initArgs.Envv, initArgs.WorkingDirectory, initArgs.Argv[0])
- if err != nil {
- return nil, 0, nil, nil, fmt.Errorf("error finding executable %q in environment %v: %v", initArgs.Argv[0], initArgs.Envv, err)
- }
- initArgs.File = fsbridge.NewVFSFile(file)
- } else {
- if initArgs.MountNamespace == nil {
- // Set initArgs so that 'ctx' returns the namespace.
- initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace()
-
- // initArgs must hold a reference on MountNamespace, which will
- // be donated to the new process in CreateProcess.
- initArgs.MountNamespace.IncRef()
- }
- f, err := user.ResolveExecutablePath(ctx, creds, initArgs.MountNamespace, initArgs.Envv, initArgs.WorkingDirectory, initArgs.Argv[0])
- if err != nil {
- return nil, 0, nil, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], initArgs.Envv, err)
- }
- initArgs.Filename = f
+ // initArgs must hold a reference on MountNamespace, which will
+ // be donated to the new process in CreateProcess.
+ initArgs.MountNamespace.IncRef()
}
}
+ resolved, err := user.ResolveExecutablePath(ctx, &initArgs)
+ if err != nil {
+ return nil, 0, nil, nil, err
+ }
+ initArgs.Filename = resolved
- fds := make([]int, len(args.FilePayload.Files))
- for i, file := range args.FilePayload.Files {
- if kernel.VFS2Enabled {
- // Need to dup to remove ownership from os.File.
- dup, err := unix.Dup(int(file.Fd()))
- if err != nil {
- return nil, 0, nil, nil, fmt.Errorf("duplicating payload files: %w", err)
- }
- fds[i] = dup
- } else {
- // VFS1 dups the file on import.
- fds[i] = int(file.Fd())
- }
+ fds, err := fd.NewFromFiles(args.Files)
+ if err != nil {
+ return nil, 0, nil, nil, fmt.Errorf("duplicating payload files: %w", err)
}
+ defer func() {
+ for _, fd := range fds {
+ _ = fd.Close()
+ }
+ }()
ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, args.StdioIsPty, fds)
if err != nil {
- if kernel.VFS2Enabled {
- for _, fd := range fds {
- unix.Close(fd)
- }
- }
return nil, 0, nil, nil, err
}
@@ -422,31 +404,3 @@ func ttyName(tty *kernel.TTY) string {
}
return fmt.Sprintf("pts/%d", tty.Index)
}
-
-// getExecutableFD resolves the given executable name and returns a
-// vfs.FileDescription for the executable file.
-func getExecutableFD(ctx context.Context, creds *auth.Credentials, vfsObj *vfs.VirtualFilesystem, mns *vfs.MountNamespace, envv []string, wd, name string) (*vfs.FileDescription, error) {
- path, err := user.ResolveExecutablePathVFS2(ctx, creds, mns, envv, wd, name)
- if err != nil {
- return nil, err
- }
-
- root := vfs.RootFromContext(ctx)
- defer root.DecRef()
-
- pop := vfs.PathOperation{
- Root: root,
- Start: root, // binPath is absolute, Start can be anything.
- Path: fspath.Parse(path),
- FollowFinalSymlink: true,
- }
- opts := &vfs.OpenOptions{
- Flags: linux.O_RDONLY,
- FileExec: true,
- }
- f, err := vfsObj.OpenAt(ctx, creds, &pop, opts)
- if err == syserror.ENOENT || err == syserror.EACCES {
- return nil, nil
- }
- return f, err
-}
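
The refactor above swaps hand-written unix.Dup loops for fd.NewFromFiles, but the ownership rules stay the same: every payload file is dup'd so the caller's os.Files and the imported fds have independent lifetimes, and the dups are closed once imported. A self-contained sketch of that discipline, assuming only golang.org/x/sys/unix (not the actual pkg/fd API):

package main

import (
	"fmt"
	"os"

	"golang.org/x/sys/unix"
)

// dupAll dup(2)s each file, undoing its work on partial failure so no
// descriptor leaks, much like the error path in execAsync.
func dupAll(files []*os.File) ([]int, error) {
	fds := make([]int, 0, len(files))
	for _, f := range files {
		nfd, err := unix.Dup(int(f.Fd()))
		if err != nil {
			for _, fd := range fds {
				_ = unix.Close(fd)
			}
			return nil, fmt.Errorf("duplicating payload files: %w", err)
		}
		fds = append(fds, nfd)
	}
	return fds, nil
}

func main() {
	fds, err := dupAll([]*os.File{os.Stdin, os.Stdout})
	if err != nil {
		panic(err)
	}
	defer func() {
		for _, fd := range fds {
			_ = unix.Close(fd)
		}
	}()
	fmt.Println("duplicated fds:", fds)
}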
diff --git a/pkg/sentry/control/state.go b/pkg/sentry/control/state.go
index 41feeffe3..d800f2c85 100644
--- a/pkg/sentry/control/state.go
+++ b/pkg/sentry/control/state.go
@@ -69,5 +69,5 @@ func (s *State) Save(o *SaveOpts, _ *struct{}) error {
s.Kernel.Kill(kernel.ExitStatus{})
},
}
- return saveOpts.Save(s.Kernel, s.Watchdog)
+ return saveOpts.Save(s.Kernel.SupervisorContext(), s.Kernel, s.Watchdog)
}
diff --git a/pkg/sentry/device/device.go b/pkg/sentry/device/device.go
index 69e71e322..6ca9dc79f 100644
--- a/pkg/sentry/device/device.go
+++ b/pkg/sentry/device/device.go
@@ -188,6 +188,9 @@ type MultiDevice struct {
// String stringifies MultiDevice.
func (m *MultiDevice) String() string {
+ m.mu.Lock()
+ defer m.mu.Unlock()
+
buf := bytes.NewBuffer(nil)
buf.WriteString("cache{")
for k, v := range m.cache {
@@ -253,7 +256,7 @@ func (m *MultiDevice) Load(key MultiDeviceKey, value uint64) bool {
}
if k, exists := m.rcache[value]; exists && k != key {
// Should never happen.
- panic("MultiDevice's caches are inconsistent")
+ panic(fmt.Sprintf("MultiDevice's caches are inconsistent, current: %+v, previous: %+v", key, k))
}
// Cache value at key.
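
The String fix above is a data-race fix: iterating a Go map while another goroutine mutates it is undefined behavior, so a Stringer on a lock-guarded type must take the same lock as the mutators. A minimal sketch:

package main

import (
	"bytes"
	"fmt"
	"sync"
)

type cache struct {
	mu sync.Mutex
	m  map[string]uint64 // guarded by mu
}

// String takes mu, exactly like the writers, so it can be called safely
// while other goroutines update the map.
func (c *cache) String() string {
	c.mu.Lock()
	defer c.mu.Unlock()

	buf := bytes.NewBuffer(nil)
	buf.WriteString("cache{")
	for k, v := range c.m {
		fmt.Fprintf(buf, "%s -> %d, ", k, v)
	}
	buf.WriteString("}")
	return buf.String()
}

func main() {
	c := &cache{m: map[string]uint64{"a": 1}}
	fmt.Println(c)
}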
diff --git a/pkg/sentry/devices/memdev/BUILD b/pkg/sentry/devices/memdev/BUILD
index abe58f818..4c8604d58 100644
--- a/pkg/sentry/devices/memdev/BUILD
+++ b/pkg/sentry/devices/memdev/BUILD
@@ -18,9 +18,10 @@ go_library(
"//pkg/rand",
"//pkg/safemem",
"//pkg/sentry/fsimpl/devtmpfs",
+ "//pkg/sentry/fsimpl/tmpfs",
+ "//pkg/sentry/kernel",
+ "//pkg/sentry/kernel/auth",
"//pkg/sentry/memmap",
- "//pkg/sentry/mm",
- "//pkg/sentry/pgalloc",
"//pkg/sentry/vfs",
"//pkg/syserror",
"//pkg/usermem",
diff --git a/pkg/sentry/devices/memdev/full.go b/pkg/sentry/devices/memdev/full.go
index c7e197691..fece3e762 100644
--- a/pkg/sentry/devices/memdev/full.go
+++ b/pkg/sentry/devices/memdev/full.go
@@ -24,6 +24,8 @@ import (
const fullDevMinor = 7
// fullDevice implements vfs.Device for /dev/full.
+//
+// +stateify savable
type fullDevice struct{}
// Open implements vfs.Device.Open.
@@ -38,14 +40,17 @@ func (fullDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, op
}
// fullFD implements vfs.FileDescriptionImpl for /dev/full.
+//
+// +stateify savable
type fullFD struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
+ vfs.NoLockFD
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *fullFD) Release() {
+func (fd *fullFD) Release(context.Context) {
// noop
}
diff --git a/pkg/sentry/devices/memdev/null.go b/pkg/sentry/devices/memdev/null.go
index 33d060d02..ff5837747 100644
--- a/pkg/sentry/devices/memdev/null.go
+++ b/pkg/sentry/devices/memdev/null.go
@@ -25,6 +25,8 @@ import (
const nullDevMinor = 3
// nullDevice implements vfs.Device for /dev/null.
+//
+// +stateify savable
type nullDevice struct{}
// Open implements vfs.Device.Open.
@@ -39,14 +41,17 @@ func (nullDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, op
}
// nullFD implements vfs.FileDescriptionImpl for /dev/null.
+//
+// +stateify savable
type nullFD struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
+ vfs.NoLockFD
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *nullFD) Release() {
+func (fd *nullFD) Release(context.Context) {
// noop
}
diff --git a/pkg/sentry/devices/memdev/random.go b/pkg/sentry/devices/memdev/random.go
index acfa23149..ac943e3ba 100644
--- a/pkg/sentry/devices/memdev/random.go
+++ b/pkg/sentry/devices/memdev/random.go
@@ -30,6 +30,8 @@ const (
)
// randomDevice implements vfs.Device for /dev/random and /dev/urandom.
+//
+// +stateify savable
type randomDevice struct{}
// Open implements vfs.Device.Open.
@@ -44,10 +46,13 @@ func (randomDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry,
}
// randomFD implements vfs.FileDescriptionImpl for /dev/random.
+//
+// +stateify savable
type randomFD struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
+ vfs.NoLockFD
// off is the "file offset". off is accessed using atomic memory
// operations.
@@ -55,7 +60,7 @@ type randomFD struct {
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *randomFD) Release() {
+func (fd *randomFD) Release(context.Context) {
// noop
}
diff --git a/pkg/sentry/devices/memdev/zero.go b/pkg/sentry/devices/memdev/zero.go
index 3b1372b9e..1929e41cd 100644
--- a/pkg/sentry/devices/memdev/zero.go
+++ b/pkg/sentry/devices/memdev/zero.go
@@ -16,9 +16,10 @@ package memdev
import (
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/mm"
- "gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -26,6 +27,8 @@ import (
const zeroDevMinor = 5
// zeroDevice implements vfs.Device for /dev/zero.
+//
+// +stateify savable
type zeroDevice struct{}
// Open implements vfs.Device.Open.
@@ -40,14 +43,17 @@ func (zeroDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, op
}
// zeroFD implements vfs.FileDescriptionImpl for /dev/zero.
+//
+// +stateify savable
type zeroFD struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
+ vfs.NoLockFD
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *zeroFD) Release() {
+func (fd *zeroFD) Release(context.Context) {
// noop
}
@@ -78,11 +84,22 @@ func (fd *zeroFD) Seek(ctx context.Context, offset int64, whence int32) (int64,
// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
func (fd *zeroFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
- m, err := mm.NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx))
+ if opts.Private || !opts.MaxPerms.Write {
+ // This mapping will never permit writing to the "underlying file" (in
+ // Linux terms, it isn't VM_SHARED), so implement it as an anonymous
+ // mapping, but back it with fd; this is what Linux does, and is
+ // actually application-visible because the resulting VMA will show up
+ // in /proc/[pid]/maps with fd.vfsfd.VirtualDentry()'s path rather than
+ // "/dev/zero (deleted)".
+ opts.Offset = 0
+ opts.MappingIdentity = &fd.vfsfd
+ opts.MappingIdentity.IncRef()
+ return nil
+ }
+ tmpfsFD, err := tmpfs.NewZeroFile(ctx, auth.CredentialsFromContext(ctx), kernel.KernelFromContext(ctx).ShmMount(), opts.Length)
if err != nil {
return err
}
- opts.MappingIdentity = m
- opts.Mappable = m
- return nil
+ defer tmpfsFD.DecRef(ctx)
+ return tmpfsFD.ConfigureMMap(ctx, opts)
}
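
The ConfigureMMap rewrite above distinguishes two cases: a mapping that can never write through to a backing file (MAP_PRIVATE, or read-only forever) is served as an anonymous mapping that still names /dev/zero in /proc/[pid]/maps, while a shared, potentially writable mapping gets a real tmpfs-backed zero file. A cut-down sketch of the branch, with mmapOpts standing in for memmap.MMapOpts:

package main

import "fmt"

type mmapOpts struct {
	Private       bool // MAP_PRIVATE rather than MAP_SHARED
	MaxPermsWrite bool // may the mapping ever become writable?
}

// useAnonymous reports whether the cheap anonymous-mapping path suffices;
// only shared, potentially writable mappings need tmpfs backing.
func useAnonymous(opts mmapOpts) bool {
	return opts.Private || !opts.MaxPermsWrite
}

func main() {
	fmt.Println(useAnonymous(mmapOpts{Private: true}))       // true
	fmt.Println(useAnonymous(mmapOpts{MaxPermsWrite: true})) // false: needs tmpfs
}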
diff --git a/pkg/sentry/devices/ttydev/BUILD b/pkg/sentry/devices/ttydev/BUILD
new file mode 100644
index 000000000..b4b6ca38a
--- /dev/null
+++ b/pkg/sentry/devices/ttydev/BUILD
@@ -0,0 +1,16 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+ name = "ttydev",
+ srcs = ["ttydev.go"],
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/context",
+ "//pkg/sentry/fsimpl/devtmpfs",
+ "//pkg/sentry/vfs",
+ "//pkg/syserror",
+ ],
+)
diff --git a/pkg/sentry/devices/ttydev/ttydev.go b/pkg/sentry/devices/ttydev/ttydev.go
new file mode 100644
index 000000000..a287c65ca
--- /dev/null
+++ b/pkg/sentry/devices/ttydev/ttydev.go
@@ -0,0 +1,53 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package ttydev implements an unopenable vfs.Device for /dev/tty.
+package ttydev
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+const (
+ // See drivers/tty/tty_io.c:tty_init().
+ ttyDevMinor = 0
+ consoleDevMinor = 1
+)
+
+// ttyDevice implements vfs.Device for /dev/tty.
+//
+// +stateify savable
+type ttyDevice struct{}
+
+// Open implements vfs.Device.Open.
+func (ttyDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ return nil, syserror.EIO
+}
+
+// Register registers all devices implemented by this package in vfsObj.
+func Register(vfsObj *vfs.VirtualFilesystem) error {
+ return vfsObj.RegisterDevice(vfs.CharDevice, linux.TTYAUX_MAJOR, ttyDevMinor, ttyDevice{}, &vfs.RegisterDeviceOptions{
+ GroupName: "tty",
+ })
+}
+
+// CreateDevtmpfsFiles creates device special files in dev representing all
+// devices implemented by this package.
+func CreateDevtmpfsFiles(ctx context.Context, dev *devtmpfs.Accessor) error {
+ return dev.CreateDeviceFile(ctx, "tty", vfs.CharDevice, linux.TTYAUX_MAJOR, ttyDevMinor, 0666 /* mode */)
+}
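
Character devices are keyed by a (major, minor) pair; /dev/tty is (TTYAUX_MAJOR, 0), with TTYAUX_MAJOR = 5 on Linux. A self-contained sketch of such a registry (an illustration of the idea, not the actual vfs.RegisterDevice API):

package main

import (
	"errors"
	"fmt"
)

type devKey struct{ major, minor uint32 }

type registry struct{ devices map[devKey]string }

// register fails on duplicates, mirroring how a VFS rejects registering two
// devices with the same (major, minor).
func (r *registry) register(major, minor uint32, name string) error {
	key := devKey{major, minor}
	if _, ok := r.devices[key]; ok {
		return errors.New("device already registered")
	}
	r.devices[key] = name
	return nil
}

func main() {
	r := &registry{devices: make(map[devKey]string)}
	if err := r.register(5, 0, "tty"); err != nil { // char (5, 0) is /dev/tty
		panic(err)
	}
	fmt.Println(r.devices)
}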
diff --git a/pkg/sentry/devices/tundev/BUILD b/pkg/sentry/devices/tundev/BUILD
new file mode 100644
index 000000000..14a8bf9cd
--- /dev/null
+++ b/pkg/sentry/devices/tundev/BUILD
@@ -0,0 +1,24 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+ name = "tundev",
+ srcs = ["tundev.go"],
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/context",
+ "//pkg/sentry/arch",
+ "//pkg/sentry/fsimpl/devtmpfs",
+ "//pkg/sentry/inet",
+ "//pkg/sentry/kernel",
+ "//pkg/sentry/socket/netstack",
+ "//pkg/sentry/vfs",
+ "//pkg/syserror",
+ "//pkg/tcpip/link/tun",
+ "//pkg/tcpip/network/arp",
+ "//pkg/usermem",
+ "//pkg/waiter",
+ ],
+)
diff --git a/pkg/sentry/devices/tundev/tundev.go b/pkg/sentry/devices/tundev/tundev.go
new file mode 100644
index 000000000..ff5d49fbd
--- /dev/null
+++ b/pkg/sentry/devices/tundev/tundev.go
@@ -0,0 +1,191 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package tundev implements the /dev/net/tun device.
+package tundev
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/inet"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/socket/netstack"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/tcpip/link/tun"
+ "gvisor.dev/gvisor/pkg/tcpip/network/arp"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+ netTunDevMajor = 10
+ netTunDevMinor = 200
+)
+
+// tunDevice implements vfs.Device for /dev/net/tun.
+//
+// +stateify savable
+type tunDevice struct{}
+
+// Open implements vfs.Device.Open.
+func (tunDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd := &tunFD{}
+ if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{
+ UseDentryMetadata: true,
+ }); err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
+
+// tunFD implements vfs.FileDescriptionImpl for /dev/net/tun.
+//
+// +stateify savable
+type tunFD struct {
+ vfsfd vfs.FileDescription
+ vfs.FileDescriptionDefaultImpl
+ vfs.DentryMetadataFileDescriptionImpl
+ vfs.NoLockFD
+
+ device tun.Device
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (fd *tunFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ request := args[1].Uint()
+ data := args[2].Pointer()
+
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ panic("Ioctl should be called from a task context")
+ }
+
+ switch request {
+ case linux.TUNSETIFF:
+ if !t.HasCapability(linux.CAP_NET_ADMIN) {
+ return 0, syserror.EPERM
+ }
+ stack, ok := t.NetworkContext().(*netstack.Stack)
+ if !ok {
+ return 0, syserror.EINVAL
+ }
+
+ var req linux.IFReq
+ if _, err := req.CopyIn(t, data); err != nil {
+ return 0, err
+ }
+ flags := usermem.ByteOrder.Uint16(req.Data[:])
+ created, err := fd.device.SetIff(stack.Stack, req.Name(), flags)
+ if err == nil && created {
+ // Always start with an ARP address for interfaces so they can handle ARP
+ // packets.
+ nicID := fd.device.NICID()
+ if err := stack.Stack.AddAddress(nicID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
+ panic(fmt.Sprintf("failed to add ARP address after creating new TUN/TAP interface with ID = %d", nicID))
+ }
+ }
+ return 0, err
+
+ case linux.TUNGETIFF:
+ var req linux.IFReq
+
+ copy(req.IFName[:], fd.device.Name())
+
+ // Linux adds IFF_NOFILTER (the same value as IFF_NO_PI unfortunately) when
+ // there is no sk_filter. See __tun_chr_ioctl() in drivers/net/tun.c.
+ flags := fd.device.Flags() | linux.IFF_NOFILTER
+ usermem.ByteOrder.PutUint16(req.Data[:], flags)
+
+ _, err := req.CopyOut(t, data)
+ return 0, err
+
+ default:
+ return 0, syserror.ENOTTY
+ }
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *tunFD) Release(ctx context.Context) {
+ fd.device.Release(ctx)
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *tunFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ return fd.Read(ctx, dst, opts)
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *tunFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ data, err := fd.device.Read()
+ if err != nil {
+ return 0, err
+ }
+ n, err := dst.CopyOut(ctx, data)
+ if n > 0 && n < len(data) {
+ // Not an error for partial copying. Packet truncated.
+ err = nil
+ }
+ return int64(n), err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *tunFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ return fd.Write(ctx, src, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *tunFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ data := make([]byte, src.NumBytes())
+ if _, err := src.CopyIn(ctx, data); err != nil {
+ return 0, err
+ }
+ return fd.device.Write(data)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (fd *tunFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return fd.device.Readiness(mask)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (fd *tunFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ fd.device.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (fd *tunFD) EventUnregister(e *waiter.Entry) {
+ fd.device.EventUnregister(e)
+}
+
+// IsNetTunSupported returns whether /dev/net/tun device is supported for s.
+func IsNetTunSupported(s inet.Stack) bool {
+ _, ok := s.(*netstack.Stack)
+ return ok
+}
+
+// Register registers all devices implemented by this package in vfsObj.
+func Register(vfsObj *vfs.VirtualFilesystem) error {
+ return vfsObj.RegisterDevice(vfs.CharDevice, netTunDevMajor, netTunDevMinor, tunDevice{}, &vfs.RegisterDeviceOptions{})
+}
+
+// CreateDevtmpfsFiles creates device special files in dev representing all
+// devices implemented by this package.
+func CreateDevtmpfsFiles(ctx context.Context, dev *devtmpfs.Accessor) error {
+ return dev.CreateDeviceFile(ctx, "net/tun", vfs.CharDevice, netTunDevMajor, netTunDevMinor, 0666 /* mode */)
+}
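
In the TUNGETIFF path above, the device flags land in the first two bytes of the IFReq payload via usermem.ByteOrder.PutUint16. A standalone sketch of that encoding, using flag values from Linux's <linux/if_tun.h> and assuming a little-endian host (as usermem.ByteOrder is on x86):

package main

import (
	"encoding/binary"
	"fmt"
)

const (
	iffTun      = 0x0001 // IFF_TUN
	iffNoPI     = 0x1000 // IFF_NO_PI
	iffNoFilter = 0x1000 // IFF_NOFILTER: same value as IFF_NO_PI
)

func main() {
	// ifr_ifru is a 24-byte union; the flags occupy its first two bytes
	// in native byte order.
	var data [24]byte
	flags := uint16(iffTun | iffNoFilter)
	binary.LittleEndian.PutUint16(data[:2], flags)
	fmt.Printf("flags=%#04x\n", binary.LittleEndian.Uint16(data[:2]))
}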
diff --git a/pkg/sentry/fdimport/BUILD b/pkg/sentry/fdimport/BUILD
index 5e41ceb4e..6b4f8b0ed 100644
--- a/pkg/sentry/fdimport/BUILD
+++ b/pkg/sentry/fdimport/BUILD
@@ -10,6 +10,7 @@ go_library(
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/context",
+ "//pkg/fd",
"//pkg/sentry/fs",
"//pkg/sentry/fs/host",
"//pkg/sentry/fsimpl/host",
diff --git a/pkg/sentry/fdimport/fdimport.go b/pkg/sentry/fdimport/fdimport.go
index a4199f9e9..314661475 100644
--- a/pkg/sentry/fdimport/fdimport.go
+++ b/pkg/sentry/fdimport/fdimport.go
@@ -15,7 +15,10 @@
package fdimport
import (
+ "fmt"
+
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/host"
hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
@@ -25,8 +28,9 @@ import (
// Import imports a slice of FDs into the given FDTable. If console is true,
// sets up TTY for the first 3 FDs in the slice representing stdin, stdout,
-// stderr. Upon success, Import takes ownership of all FDs.
-func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []int) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+// stderr. Used FDs are either closed or released. It's safe for the caller to
+// close any remaining files upon return.
+func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []*fd.FD) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
if kernel.VFS2Enabled {
ttyFile, err := importVFS2(ctx, fdTable, console, fds)
return nil, ttyFile, err
@@ -35,7 +39,7 @@ func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []in
return ttyFile, nil, err
}
-func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []int) (*host.TTYFileOperations, error) {
+func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []*fd.FD) (*host.TTYFileOperations, error) {
var ttyFile *fs.File
for appFD, hostFD := range fds {
var appFile *fs.File
@@ -44,11 +48,12 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []
// Import the file as a host TTY file.
if ttyFile == nil {
var err error
- appFile, err = host.ImportFile(ctx, hostFD, true /* isTTY */)
+ appFile, err = host.ImportFile(ctx, hostFD.FD(), true /* isTTY */)
if err != nil {
return nil, err
}
- defer appFile.DecRef()
+ defer appFile.DecRef(ctx)
+ _ = hostFD.Close() // FD is dup'd in ImportFile.
// Remember this in the TTY file, as we will
// use it for the other stdio FDs.
@@ -63,11 +68,12 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []
} else {
// Import the file as a regular host file.
var err error
- appFile, err = host.ImportFile(ctx, hostFD, false /* isTTY */)
+ appFile, err = host.ImportFile(ctx, hostFD.FD(), false /* isTTY */)
if err != nil {
return nil, err
}
- defer appFile.DecRef()
+ defer appFile.DecRef(ctx)
+ _ = hostFD.Close() // FD is dup'd in ImportFile.
}
// Add the file to the FD map.
@@ -82,8 +88,11 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []
return ttyFile.FileOperations.(*host.TTYFileOperations), nil
}
-func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdioFDs []int) (*hostvfs2.TTYFileDescription, error) {
+func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdioFDs []*fd.FD) (*hostvfs2.TTYFileDescription, error) {
k := kernel.KernelFromContext(ctx)
+ if k == nil {
+ return nil, fmt.Errorf("cannot find kernel from context")
+ }
var ttyFile *vfs.FileDescription
for appFD, hostFD := range stdioFDs {
@@ -93,11 +102,12 @@ func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdi
// Import the file as a host TTY file.
if ttyFile == nil {
var err error
- appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD, true /* isTTY */)
+ appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD.FD(), true /* isTTY */)
if err != nil {
return nil, err
}
- defer appFile.DecRef()
+ defer appFile.DecRef(ctx)
+ hostFD.Release() // FD is transferred to host FD.
// Remember this in the TTY file, as we will use it for the other stdio
// FDs.
@@ -110,11 +120,12 @@ func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdi
}
} else {
var err error
- appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD, false /* isTTY */)
+ appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD.FD(), false /* isTTY */)
if err != nil {
return nil, err
}
- defer appFile.DecRef()
+ defer appFile.DecRef(ctx)
+ hostFD.Release() // FD is transferred to host FD.
}
if err := fdTable.NewFDAtVFS2(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil {
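
Note the two disposal comments above: Close is used when the importer dup'd the descriptor (VFS1's host.ImportFile), while Release is used when the importer took ownership of the descriptor itself (VFS2's hostvfs2.ImportFD). A minimal sketch of that split, mimicking (not reproducing) the pkg/fd.FD semantics:

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

type ownedFD struct{ fd int }

// Close disposes of a descriptor we still own.
func (o *ownedFD) Close() error {
	if o.fd < 0 {
		return nil
	}
	err := unix.Close(o.fd)
	o.fd = -1
	return err
}

// Release hands the raw descriptor to a new owner; we must not close it.
func (o *ownedFD) Release() int {
	fd := o.fd
	o.fd = -1
	return fd
}

func main() {
	fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM, 0)
	if err != nil {
		panic(err)
	}
	a, b := &ownedFD{fds[0]}, &ownedFD{fds[1]}
	_ = a.Close()              // the callee dup'd it; we close our copy
	transferred := b.Release() // the callee owns it now
	fmt.Println("transferred fd:", transferred)
	_ = unix.Close(transferred) // the new owner's responsibility
}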
diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go
index ab1424c95..ff2fe6712 100644
--- a/pkg/sentry/fs/copy_up.go
+++ b/pkg/sentry/fs/copy_up.go
@@ -107,8 +107,7 @@ func copyUp(ctx context.Context, d *Dirent) error {
// leave the upper filesystem filled with any number of parent directories
// but the upper filesystem will never be in an inconsistent state.
//
-// Preconditions:
-// - d.Inode.overlay is non-nil.
+// Preconditions: d.Inode.overlay is non-nil.
func copyUpLockedForRename(ctx context.Context, d *Dirent) error {
for {
// Did we race with another copy up or does there
@@ -183,12 +182,12 @@ func doCopyUp(ctx context.Context, d *Dirent) error {
// Returns a generic error on failure.
//
// Preconditions:
-// - parent.Inode.overlay.upper must be non-nil.
-// - next.Inode.overlay.copyMu must be locked writable.
-// - next.Inode.overlay.lower must be non-nil.
-// - next.Inode.overlay.lower.StableAttr.Type must be RegularFile, Directory,
+// * parent.Inode.overlay.upper must be non-nil.
+// * next.Inode.overlay.copyMu must be locked writable.
+// * next.Inode.overlay.lower must be non-nil.
+// * next.Inode.overlay.lower.StableAttr.Type must be RegularFile, Directory,
// or Symlink.
-// - upper filesystem must support setting file ownership and timestamps.
+// * upper filesystem must support setting file ownership and timestamps.
func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
// Extract the attributes of the file we wish to copy.
attrs, err := next.Inode.overlay.lower.UnstableAttr(ctx)
@@ -201,7 +200,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
parentUpper := parent.Inode.overlay.upper
root := RootFromContext(ctx)
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(ctx)
}
// Create the file in the upper filesystem and get an Inode for it.
@@ -212,7 +211,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
log.Warningf("copy up failed to create file: %v", err)
return syserror.EIO
}
- defer childFile.DecRef()
+ defer childFile.DecRef(ctx)
childUpperInode = childFile.Dirent.Inode
case Directory:
@@ -226,7 +225,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
cleanupUpper(ctx, parentUpper, next.name, werr)
return syserror.EIO
}
- defer childUpper.DecRef()
+ defer childUpper.DecRef(ctx)
childUpperInode = childUpper.Inode
case Symlink:
@@ -246,7 +245,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
cleanupUpper(ctx, parentUpper, next.name, werr)
return syserror.EIO
}
- defer childUpper.DecRef()
+ defer childUpper.DecRef(ctx)
childUpperInode = childUpper.Inode
default:
@@ -352,14 +351,14 @@ func copyContentsLocked(ctx context.Context, upper *Inode, lower *Inode, size in
if err != nil {
return err
}
- defer upperFile.DecRef()
+ defer upperFile.DecRef(ctx)
// Get a handle to the lower filesystem, which we will read from.
lowerFile, err := overlayFile(ctx, lower, FileFlags{Read: true})
if err != nil {
return err
}
- defer lowerFile.DecRef()
+ defer lowerFile.DecRef(ctx)
// Use a buffer pool to minimize allocations.
buf := copyUpBuffers.Get().([]byte)
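
copyUpBuffers above is a pool of scratch buffers reused across copy-up calls so the hot path does not allocate per file. A minimal sync.Pool sketch of the pattern (the 8 KiB size is illustrative, not taken from the source):

package main

import (
	"fmt"
	"sync"
)

var copyBuffers = sync.Pool{
	New: func() interface{} {
		return make([]byte, 8192)
	},
}

func main() {
	// Get a buffer, use it for one copy operation, and return it so the
	// next caller can reuse the allocation.
	buf := copyBuffers.Get().([]byte)
	defer copyBuffers.Put(buf)
	fmt.Println("scratch buffer of", len(buf), "bytes")
}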
diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go
index 91792d9fe..c7a11eec1 100644
--- a/pkg/sentry/fs/copy_up_test.go
+++ b/pkg/sentry/fs/copy_up_test.go
@@ -126,7 +126,7 @@ func makeOverlayTestFiles(t *testing.T) []*overlayTestFile {
if err != nil {
t.Fatalf("failed to create file %q: %v", name, err)
}
- defer f.DecRef()
+ defer f.DecRef(ctx)
relname, _ := f.Dirent.FullName(lowerRoot)
@@ -171,7 +171,7 @@ func makeOverlayTestFiles(t *testing.T) []*overlayTestFile {
if err != nil {
t.Fatalf("failed to find %q: %v", f.name, err)
}
- defer d.DecRef()
+ defer d.DecRef(ctx)
f.File, err = d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true})
if err != nil {
diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD
index 9379a4d7b..6b7b451b8 100644
--- a/pkg/sentry/fs/dev/BUILD
+++ b/pkg/sentry/fs/dev/BUILD
@@ -34,6 +34,7 @@ go_library(
"//pkg/sentry/socket/netstack",
"//pkg/syserror",
"//pkg/tcpip/link/tun",
+ "//pkg/tcpip/network/arp",
"//pkg/usermem",
"//pkg/waiter",
],
diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go
index dc7ad075a..19ffdec47 100644
--- a/pkg/sentry/fs/dev/net_tun.go
+++ b/pkg/sentry/fs/dev/net_tun.go
@@ -15,6 +15,8 @@
package dev
import (
+ "fmt"
+
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -25,6 +27,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/tcpip/link/tun"
+ "gvisor.dev/gvisor/pkg/tcpip/network/arp"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -60,7 +63,7 @@ func newNetTunDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMod
}
// GetFile implements fs.InodeOperations.GetFile.
-func (iops *netTunInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+func (*netTunInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
return fs.NewFile(ctx, d, flags, &netTunFileOperations{}), nil
}
@@ -80,21 +83,22 @@ type netTunFileOperations struct {
var _ fs.FileOperations = (*netTunFileOperations)(nil)
// Release implements fs.FileOperations.Release.
-func (fops *netTunFileOperations) Release() {
- fops.device.Release()
+func (n *netTunFileOperations) Release(ctx context.Context) {
+ n.device.Release(ctx)
}
// Ioctl implements fs.FileOperations.Ioctl.
-func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (n *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
request := args[1].Uint()
data := args[2].Pointer()
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ panic("Ioctl should be called from a task context")
+ }
+
switch request {
case linux.TUNSETIFF:
- t := kernel.TaskFromContext(ctx)
- if t == nil {
- panic("Ioctl should be called from a task context")
- }
if !t.HasCapability(linux.CAP_NET_ADMIN) {
return 0, syserror.EPERM
}
@@ -104,27 +108,32 @@ func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io u
}
var req linux.IFReq
- if _, err := usermem.CopyObjectIn(ctx, io, data, &req, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ if _, err := req.CopyIn(t, data); err != nil {
return 0, err
}
flags := usermem.ByteOrder.Uint16(req.Data[:])
- return 0, fops.device.SetIff(stack.Stack, req.Name(), flags)
+ created, err := n.device.SetIff(stack.Stack, req.Name(), flags)
+ if err == nil && created {
+ // Always start with an ARP address for interfaces so they can handle ARP
+ // packets.
+ nicID := n.device.NICID()
+ if err := stack.Stack.AddAddress(nicID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
+ panic(fmt.Sprintf("failed to add ARP address after creating new TUN/TAP interface with ID = %d", nicID))
+ }
+ }
+ return 0, err
case linux.TUNGETIFF:
var req linux.IFReq
- copy(req.IFName[:], fops.device.Name())
+ copy(req.IFName[:], n.device.Name())
// Linux adds IFF_NOFILTER (the same value as IFF_NO_PI unfortunately) when
// there is no sk_filter. See __tun_chr_ioctl() in drivers/net/tun.c.
- flags := fops.device.Flags() | linux.IFF_NOFILTER
+ flags := n.device.Flags() | linux.IFF_NOFILTER
usermem.ByteOrder.PutUint16(req.Data[:], flags)
- _, err := usermem.CopyObjectOut(ctx, io, data, &req, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := req.CopyOut(t, data)
return 0, err
default:
@@ -133,41 +142,41 @@ func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io u
}
// Write implements fs.FileOperations.Write.
-func (fops *netTunFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+func (n *netTunFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
data := make([]byte, src.NumBytes())
if _, err := src.CopyIn(ctx, data); err != nil {
return 0, err
}
- return fops.device.Write(data)
+ return n.device.Write(data)
}
// Read implements fs.FileOperations.Read.
-func (fops *netTunFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
- data, err := fops.device.Read()
+func (n *netTunFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+ data, err := n.device.Read()
if err != nil {
return 0, err
}
- n, err := dst.CopyOut(ctx, data)
- if n > 0 && n < len(data) {
+ bytesCopied, err := dst.CopyOut(ctx, data)
+ if bytesCopied > 0 && bytesCopied < len(data) {
// Not an error for partial copying. Packet truncated.
err = nil
}
- return int64(n), err
+ return int64(bytesCopied), err
}
// Readiness implements waiter.Waitable.Readiness.
-func (fops *netTunFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
- return fops.device.Readiness(mask)
+func (n *netTunFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return n.device.Readiness(mask)
}
// EventRegister implements waiter.Waitable.EventRegister.
-func (fops *netTunFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
- fops.device.EventRegister(e, mask)
+func (n *netTunFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ n.device.EventRegister(e, mask)
}
// EventUnregister implements waiter.Waitable.EventUnregister.
-func (fops *netTunFileOperations) EventUnregister(e *waiter.Entry) {
- fops.device.EventUnregister(e)
+func (n *netTunFileOperations) EventUnregister(e *waiter.Entry) {
+ n.device.EventUnregister(e)
}
// isNetTunSupported returns whether /dev/net/tun device is supported for s.
diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go
index 65be12175..00c526b03 100644
--- a/pkg/sentry/fs/dirent.go
+++ b/pkg/sentry/fs/dirent.go
@@ -325,7 +325,7 @@ func (d *Dirent) SyncAll(ctx context.Context) {
for _, w := range d.children {
if child := w.Get(); child != nil {
child.(*Dirent).SyncAll(ctx)
- child.DecRef()
+ child.DecRef(ctx)
}
}
}
@@ -413,9 +413,9 @@ func (d *Dirent) descendantOf(p *Dirent) bool {
// Inode.Lookup, otherwise walk will keep d.mu locked.
//
// Preconditions:
-// - renameMu must be held for reading.
-// - d.mu must be held.
-// - name must must not contain "/"s.
+// * renameMu must be held for reading.
+// * d.mu must be held.
+// * name must not contain "/"s.
func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnlock bool) (*Dirent, error) {
if !IsDir(d.Inode.StableAttr) {
return nil, syscall.ENOTDIR
@@ -451,7 +451,7 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl
// which don't hold a hard reference on their parent (their parent holds a
// hard reference on them, and they contain virtually no state). But this is
// good house-keeping.
- child.DecRef()
+ child.DecRef(ctx)
return nil, syscall.ENOENT
}
@@ -468,20 +468,20 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl
// their pins on the child. Inotify doesn't properly support filesystems that
// revalidate dirents (since watches are lost on revalidation), but if we fail
// to unpin the watches child will never be GCed.
- cd.Inode.Watches.Unpin(cd)
+ cd.Inode.Watches.Unpin(ctx, cd)
// This child needs to be revalidated, fallthrough to unhash it. Make sure
// to not leak a reference from Get().
//
// Note that previous lookups may still have a reference to this stale child;
// this can't be helped, but we can ensure that *new* lookups are up-to-date.
- child.DecRef()
+ child.DecRef(ctx)
}
// Either our weak reference expired or we need to revalidate it. Unhash child first, we're
// about to replace it.
delete(d.children, name)
- w.Drop()
+ w.Drop(ctx)
}
// Slow path: load the InodeOperations into memory. Since this is a hot path and the lookup may be
@@ -512,12 +512,12 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl
// There are active references to the existing child, prefer it to the one we
// retrieved from Lookup. Likely the Lookup happened very close to the insertion
// of child, so considering one stale over the other is fairly arbitrary.
- c.DecRef()
+ c.DecRef(ctx)
// The child that was installed could be negative.
if cd.IsNegative() {
// If so, don't leak a reference and short circuit.
- child.DecRef()
+ child.DecRef(ctx)
return nil, syscall.ENOENT
}
@@ -531,7 +531,7 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl
// we did the Inode.Lookup. Fully drop the weak reference and fallback to using the child
// we looked up.
delete(d.children, name)
- w.Drop()
+ w.Drop(ctx)
}
// Give the looked up child a parent. We cannot kick out entries, since we just checked above
@@ -577,9 +577,9 @@ func (d *Dirent) Walk(ctx context.Context, root *Dirent, name string) (*Dirent,
// exists returns true if name exists in relation to d.
//
// Preconditions:
-// - renameMu must be held for reading.
-// - d.mu must be held.
-// - name must must not contain "/"s.
+// * renameMu must be held for reading.
+// * d.mu must be held.
+// * name must not contain "/"s.
func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool {
child, err := d.walk(ctx, root, name, false /* may unlock */)
if err != nil {
@@ -587,7 +587,7 @@ func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool {
return false
}
// Child exists.
- child.DecRef()
+ child.DecRef(ctx)
return true
}
@@ -622,7 +622,7 @@ func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags Fi
}
child := file.Dirent
- d.finishCreate(child, name)
+ d.finishCreate(ctx, child, name)
// Return the reference and the new file. When the last reference to
// the file is dropped, file.Dirent may no longer be cached.
@@ -631,7 +631,7 @@ func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags Fi
// finishCreate validates the created file, adds it as a child of this dirent,
// and notifies any watchers.
-func (d *Dirent) finishCreate(child *Dirent, name string) {
+func (d *Dirent) finishCreate(ctx context.Context, child *Dirent, name string) {
// Sanity check c; its name must be consistent.
if child.name != name {
panic(fmt.Sprintf("create from %q to %q returned unexpected name %q", d.name, name, child.name))
@@ -650,14 +650,14 @@ func (d *Dirent) finishCreate(child *Dirent, name string) {
panic(fmt.Sprintf("hashed child %q over a positive child", child.name))
}
// Don't leak a reference.
- old.DecRef()
+ old.DecRef(ctx)
// Drop d's reference.
- old.DecRef()
+ old.DecRef(ctx)
}
// Finally drop the useless weak reference on the floor.
- w.Drop()
+ w.Drop(ctx)
}
d.Inode.Watches.Notify(name, linux.IN_CREATE, 0)
@@ -686,17 +686,17 @@ func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, c
panic(fmt.Sprintf("hashed over a positive child %q", old.(*Dirent).name))
}
// Don't leak a reference.
- old.DecRef()
+ old.DecRef(ctx)
// Drop d's reference.
- old.DecRef()
+ old.DecRef(ctx)
}
// Unhash the negative Dirent, name needs to exist now.
delete(d.children, name)
// Finally drop the useless weak reference on the floor.
- w.Drop()
+ w.Drop(ctx)
}
// Execute the create operation.
@@ -756,7 +756,7 @@ func (d *Dirent) Bind(ctx context.Context, root *Dirent, name string, data trans
if e != nil {
return e
}
- d.finishCreate(childDir, name)
+ d.finishCreate(ctx, childDir, name)
return nil
})
if err == syscall.EEXIST {
@@ -901,7 +901,7 @@ func direntReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent,
// references to children.
//
// Preconditions: d.mu must be held.
-func (d *Dirent) flush() {
+func (d *Dirent) flush(ctx context.Context) {
expired := make(map[string]*refs.WeakRef)
for n, w := range d.children {
// Call flush recursively on each child before removing our
@@ -912,7 +912,7 @@ func (d *Dirent) flush() {
if !cd.IsNegative() {
// Flush the child.
cd.mu.Lock()
- cd.flush()
+ cd.flush(ctx)
cd.mu.Unlock()
// Allow the file system to drop extra references on child.
@@ -920,13 +920,13 @@ func (d *Dirent) flush() {
}
// Don't leak a reference.
- child.DecRef()
+ child.DecRef(ctx)
}
// Check if the child dirent is closed, and mark it as expired if it is.
// We must call w.Get() again here, since the child could have been closed
// by the calls to flush() and cache.Remove() in the above if-block.
if child := w.Get(); child != nil {
- child.DecRef()
+ child.DecRef(ctx)
} else {
expired[n] = w
}
@@ -935,7 +935,7 @@ func (d *Dirent) flush() {
// Remove expired entries.
for n, w := range expired {
delete(d.children, n)
- w.Drop()
+ w.Drop(ctx)
}
}
@@ -977,7 +977,7 @@ func (d *Dirent) mount(ctx context.Context, inode *Inode) (newChild *Dirent, err
if !ok {
panic("mount must mount over an existing dirent")
}
- weakRef.Drop()
+ weakRef.Drop(ctx)
// Note that even though `d` is now hidden, it still holds a reference
// to its parent.
@@ -1002,13 +1002,13 @@ func (d *Dirent) unmount(ctx context.Context, replacement *Dirent) error {
if !ok {
panic("mount must mount over an existing dirent")
}
- weakRef.Drop()
+ weakRef.Drop(ctx)
// d is not reachable anymore, and hence not mounted anymore.
d.mounted = false
// Drop mount reference.
- d.DecRef()
+ d.DecRef(ctx)
return nil
}
@@ -1029,7 +1029,7 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath
// Child does not exist.
return err
}
- defer child.DecRef()
+ defer child.DecRef(ctx)
// Remove cannot remove directories.
if IsDir(child.Inode.StableAttr) {
@@ -1055,7 +1055,7 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath
atomic.StoreInt32(&child.deleted, 1)
if w, ok := d.children[name]; ok {
delete(d.children, name)
- w.Drop()
+ w.Drop(ctx)
}
// Allow the file system to drop extra references on child.
@@ -1067,7 +1067,7 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath
// inode may have other links. If this was the last link, the events for the
// watch removal will be queued by the inode destructor.
child.Inode.Watches.MarkUnlinked()
- child.Inode.Watches.Unpin(child)
+ child.Inode.Watches.Unpin(ctx, child)
d.Inode.Watches.Notify(name, linux.IN_DELETE, 0)
return nil
@@ -1100,7 +1100,7 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string)
// Child does not exist.
return err
}
- defer child.DecRef()
+ defer child.DecRef(ctx)
// RemoveDirectory can only remove directories.
if !IsDir(child.Inode.StableAttr) {
@@ -1121,7 +1121,7 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string)
atomic.StoreInt32(&child.deleted, 1)
if w, ok := d.children[name]; ok {
delete(d.children, name)
- w.Drop()
+ w.Drop(ctx)
}
// Allow the file system to drop extra references on child.
@@ -1130,14 +1130,14 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string)
// Finally, let inotify know the child is being unlinked. Drop any extra
// refs from inotify to this child dirent.
child.Inode.Watches.MarkUnlinked()
- child.Inode.Watches.Unpin(child)
+ child.Inode.Watches.Unpin(ctx, child)
d.Inode.Watches.Notify(name, linux.IN_ISDIR|linux.IN_DELETE, 0)
return nil
}
// destroy closes this node and all children.
-func (d *Dirent) destroy() {
+func (d *Dirent) destroy(ctx context.Context) {
if d.IsNegative() {
// Nothing to tear-down and no parent references to drop, since a negative
// Dirent does not take a reference on its parent, has no Inode and no children.
@@ -1153,19 +1153,19 @@ func (d *Dirent) destroy() {
if c.(*Dirent).IsNegative() {
// The parent holds both weak and strong refs in the case of
// negative dirents.
- c.DecRef()
+ c.DecRef(ctx)
}
// Drop the reference we just acquired in WeakRef.Get.
- c.DecRef()
+ c.DecRef(ctx)
}
- w.Drop()
+ w.Drop(ctx)
}
d.children = nil
allDirents.remove(d)
// Drop our reference to the Inode.
- d.Inode.DecRef()
+ d.Inode.DecRef(ctx)
// Allow the Dirent to be GC'ed after this point, since the Inode may still
// be referenced after the Dirent is destroyed (for instance by filesystem
@@ -1175,7 +1175,7 @@ func (d *Dirent) destroy() {
// Drop the reference we have on our parent if we took one. renameMu doesn't need to be
// held because d can't be reparented without any references to it left.
if d.parent != nil {
- d.parent.DecRef()
+ d.parent.DecRef(ctx)
}
}
@@ -1201,14 +1201,14 @@ func (d *Dirent) TryIncRef() bool {
// DecRef decreases the Dirent's refcount and drops its reference on its mount.
//
// DecRef implements RefCounter.DecRef with destructor d.destroy.
-func (d *Dirent) DecRef() {
+func (d *Dirent) DecRef(ctx context.Context) {
if d.Inode != nil {
// Keep mount around, since DecRef may destroy d.Inode.
msrc := d.Inode.MountSource
- d.DecRefWithDestructor(d.destroy)
+ d.DecRefWithDestructor(ctx, d.destroy)
msrc.DecDirentRefs()
} else {
- d.DecRefWithDestructor(d.destroy)
+ d.DecRefWithDestructor(ctx, d.destroy)
}
}
@@ -1359,7 +1359,7 @@ func (d *Dirent) MayDelete(ctx context.Context, root *Dirent, name string) error
if err != nil {
return err
}
- defer victim.DecRef()
+ defer victim.DecRef(ctx)
return d.mayDelete(ctx, victim)
}
@@ -1411,7 +1411,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
if err != nil {
return err
}
- defer renamed.DecRef()
+ defer renamed.DecRef(ctx)
// Check that the renamed dirent is deletable.
if err := oldParent.mayDelete(ctx, renamed); err != nil {
@@ -1453,13 +1453,13 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
// Check that we can delete replaced.
if err := newParent.mayDelete(ctx, replaced); err != nil {
- replaced.DecRef()
+ replaced.DecRef(ctx)
return err
}
// Target should not be an ancestor of source.
if oldParent.descendantOf(replaced) {
- replaced.DecRef()
+ replaced.DecRef(ctx)
// Note that Linux returns EINVAL if the source is an
// ancestor of target, but ENOTEMPTY if the target is
@@ -1470,7 +1470,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
// Check that replaced is not a mount point.
if replaced.isMountPointLocked() {
- replaced.DecRef()
+ replaced.DecRef(ctx)
return syscall.EBUSY
}
@@ -1478,11 +1478,11 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
oldIsDir := IsDir(renamed.Inode.StableAttr)
newIsDir := IsDir(replaced.Inode.StableAttr)
if !newIsDir && oldIsDir {
- replaced.DecRef()
+ replaced.DecRef(ctx)
return syscall.ENOTDIR
}
if !oldIsDir && newIsDir {
- replaced.DecRef()
+ replaced.DecRef(ctx)
return syscall.EISDIR
}
@@ -1493,13 +1493,13 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
// open across renames is currently broken for multiple
// reasons, so we flush all references on the replaced node and
// its children.
- replaced.Inode.Watches.Unpin(replaced)
+ replaced.Inode.Watches.Unpin(ctx, replaced)
replaced.mu.Lock()
- replaced.flush()
+ replaced.flush(ctx)
replaced.mu.Unlock()
// Done with replaced.
- replaced.DecRef()
+ replaced.DecRef(ctx)
}
if err := renamed.Inode.Rename(ctx, oldParent, renamed, newParent, newName, replaced != nil); err != nil {
@@ -1513,14 +1513,14 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
// can't destroy oldParent (and try to retake its lock) because
// Rename's caller must be holding a reference.
newParent.IncRef()
- oldParent.DecRef()
+ oldParent.DecRef(ctx)
}
if w, ok := newParent.children[newName]; ok {
- w.Drop()
+ w.Drop(ctx)
delete(newParent.children, newName)
}
if w, ok := oldParent.children[oldName]; ok {
- w.Drop()
+ w.Drop(ctx)
delete(oldParent.children, oldName)
}
@@ -1551,7 +1551,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
// Same as replaced.flush above.
renamed.mu.Lock()
- renamed.flush()
+ renamed.flush(ctx)
renamed.mu.Unlock()
return nil
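
The mechanical change running through this file — DecRef, flush, Drop, and destroy all growing a context.Context parameter — exists because the destructor chain now needs a context to release inode and mount resources. A minimal sketch of a refcount whose destructor receives ctx on the final drop:

package main

import (
	"context"
	"fmt"
	"sync/atomic"
)

type refCounted struct {
	refs int64 // starts at 1 for the initial reference
}

func (r *refCounted) IncRef() { atomic.AddInt64(&r.refs, 1) }

// DecRef forwards ctx to destroy when the last reference is dropped,
// mirroring DecRefWithDestructor(ctx, d.destroy) above.
func (r *refCounted) DecRef(ctx context.Context, destroy func(context.Context)) {
	if atomic.AddInt64(&r.refs, -1) == 0 {
		destroy(ctx)
	}
}

func main() {
	d := &refCounted{refs: 1}
	d.IncRef()
	d.DecRef(context.Background(), func(context.Context) {}) // still referenced
	d.DecRef(context.Background(), func(ctx context.Context) {
		fmt.Println("destroyed with", ctx)
	})
}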
diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go
index 33de32c69..7d9dd717e 100644
--- a/pkg/sentry/fs/dirent_cache.go
+++ b/pkg/sentry/fs/dirent_cache.go
@@ -17,6 +17,7 @@ package fs
import (
"fmt"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sync"
)
@@ -101,7 +102,7 @@ func (c *DirentCache) remove(d *Dirent) {
panic(fmt.Sprintf("trying to remove %v, which is not in the dirent cache", d))
}
c.list.Remove(d)
- d.DecRef()
+ d.DecRef(context.Background())
c.currentSize--
if c.limit != nil {
c.limit.dec()
diff --git a/pkg/sentry/fs/dirent_refs_test.go b/pkg/sentry/fs/dirent_refs_test.go
index 98d69c6f2..176b894ba 100644
--- a/pkg/sentry/fs/dirent_refs_test.go
+++ b/pkg/sentry/fs/dirent_refs_test.go
@@ -51,7 +51,7 @@ func TestWalkPositive(t *testing.T) {
t.Fatalf("child name = %q has a ref count of %d, want %d", d.name, got, 1)
}
- d.DecRef()
+ d.DecRef(ctx)
if got := root.ReadRefs(); got != 1 {
t.Fatalf("root has a ref count of %d, want %d", got, 1)
@@ -61,7 +61,7 @@ func TestWalkPositive(t *testing.T) {
t.Fatalf("child name = %q has a ref count of %d, want %d", d.name, got, 0)
}
- root.flush()
+ root.flush(ctx)
if got := len(root.children); got != 0 {
t.Fatalf("root has %d children, want %d", got, 0)
@@ -114,7 +114,7 @@ func TestWalkNegative(t *testing.T) {
t.Fatalf("child has a ref count of %d, want %d", got, 2)
}
- child.DecRef()
+ child.DecRef(ctx)
if got := child.(*Dirent).ReadRefs(); got != 1 {
t.Fatalf("child has a ref count of %d, want %d", got, 1)
@@ -124,7 +124,7 @@ func TestWalkNegative(t *testing.T) {
t.Fatalf("root has %d children, want %d", got, 1)
}
- root.DecRef()
+ root.DecRef(ctx)
if got := root.ReadRefs(); got != 0 {
t.Fatalf("root has a ref count of %d, want %d", got, 0)
@@ -351,9 +351,9 @@ func TestRemoveExtraRefs(t *testing.T) {
t.Fatalf("dirent has a ref count of %d, want %d", got, 1)
}
- d.DecRef()
+ d.DecRef(ctx)
- test.root.flush()
+ test.root.flush(ctx)
if got := len(test.root.children); got != 0 {
t.Errorf("root has %d children, want %d", got, 0)
@@ -403,8 +403,8 @@ func TestRenameExtraRefs(t *testing.T) {
t.Fatalf("Rename got error %v, want nil", err)
}
- oldParent.flush()
- newParent.flush()
+ oldParent.flush(ctx)
+ newParent.flush(ctx)
// Expect to have only active references.
if got := renamed.ReadRefs(); got != 1 {
diff --git a/pkg/sentry/fs/dirent_state.go b/pkg/sentry/fs/dirent_state.go
index f623d6c0e..67a35f0b2 100644
--- a/pkg/sentry/fs/dirent_state.go
+++ b/pkg/sentry/fs/dirent_state.go
@@ -18,6 +18,7 @@ import (
"fmt"
"sync/atomic"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/refs"
)
@@ -48,7 +49,7 @@ func (d *Dirent) saveChildren() map[string]*Dirent {
for name, w := range d.children {
if rc := w.Get(); rc != nil {
// Drop the reference count obtained in w.Get().
- rc.DecRef()
+ rc.DecRef(context.Background())
cd := rc.(*Dirent)
if cd.IsNegative() {
diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go
index 9fce177ad..b99199798 100644
--- a/pkg/sentry/fs/fdpipe/pipe.go
+++ b/pkg/sentry/fs/fdpipe/pipe.go
@@ -115,7 +115,7 @@ func (p *pipeOperations) Readiness(mask waiter.EventMask) (eventMask waiter.Even
}
// Release implements fs.FileOperations.Release.
-func (p *pipeOperations) Release() {
+func (p *pipeOperations) Release(context.Context) {
fdnotifier.RemoveFD(int32(p.file.FD()))
p.file.Close()
p.file = nil
diff --git a/pkg/sentry/fs/fdpipe/pipe_opener_test.go b/pkg/sentry/fs/fdpipe/pipe_opener_test.go
index e556da48a..b9cec4b13 100644
--- a/pkg/sentry/fs/fdpipe/pipe_opener_test.go
+++ b/pkg/sentry/fs/fdpipe/pipe_opener_test.go
@@ -182,7 +182,7 @@ func TestTryOpen(t *testing.T) {
// Clean up the state of the pipe, and remove the fd from the
// fdnotifier. Sadly this is needed to maintain the correctness
// of other tests because the fdnotifier is global.
- pipeOps.Release()
+ pipeOps.Release(ctx)
}
continue
}
@@ -191,7 +191,7 @@ func TestTryOpen(t *testing.T) {
}
if pipeOps != nil {
// Same as above.
- pipeOps.Release()
+ pipeOps.Release(ctx)
}
}
}
@@ -279,7 +279,7 @@ func TestPipeOpenUnblocksEventually(t *testing.T) {
pipeOps, err := Open(ctx, opener, flags)
if pipeOps != nil {
// Same as TestTryOpen.
- pipeOps.Release()
+ pipeOps.Release(ctx)
}
// Check that the partner opened the file successfully.
@@ -325,7 +325,7 @@ func TestCopiedReadAheadBuffer(t *testing.T) {
ctx := contexttest.Context(t)
pipeOps, err := pipeOpenState.TryOpen(ctx, opener, fs.FileFlags{Read: true})
if pipeOps != nil {
- pipeOps.Release()
+ pipeOps.Release(ctx)
t.Fatalf("open(%s, %o) got file, want nil", name, syscall.O_RDONLY)
}
if err != syserror.ErrWouldBlock {
@@ -351,7 +351,7 @@ func TestCopiedReadAheadBuffer(t *testing.T) {
if pipeOps == nil {
t.Fatalf("open(%s, %o) got nil file, want not nil", name, syscall.O_RDONLY)
}
- defer pipeOps.Release()
+ defer pipeOps.Release(ctx)
if err != nil {
t.Fatalf("open(%s, %o) got error %v, want nil", name, syscall.O_RDONLY, err)
@@ -471,14 +471,14 @@ func TestPipeHangup(t *testing.T) {
f := <-fdchan
if f < 0 {
t.Errorf("%s: partner routine got fd %d, want > 0", test.desc, f)
- pipeOps.Release()
+ pipeOps.Release(ctx)
continue
}
if test.hangupSelf {
// Hangup self and assert that our partner got the expected hangup
// error.
- pipeOps.Release()
+ pipeOps.Release(ctx)
if test.flags.Read {
// Partner is writer.
@@ -490,7 +490,7 @@ func TestPipeHangup(t *testing.T) {
} else {
// Hangup our partner and expect us to get the hangup error.
syscall.Close(f)
- defer pipeOps.Release()
+ defer pipeOps.Release(ctx)
if test.flags.Read {
assertReaderHungup(t, test.desc, pipeOps.(*pipeOperations).file)
diff --git a/pkg/sentry/fs/fdpipe/pipe_test.go b/pkg/sentry/fs/fdpipe/pipe_test.go
index a0082ecca..1c9e82562 100644
--- a/pkg/sentry/fs/fdpipe/pipe_test.go
+++ b/pkg/sentry/fs/fdpipe/pipe_test.go
@@ -98,10 +98,11 @@ func TestNewPipe(t *testing.T) {
}
f := fd.New(gfd)
- p, err := newPipeOperations(contexttest.Context(t), nil, test.flags, f, test.readAheadBuffer)
+ ctx := contexttest.Context(t)
+ p, err := newPipeOperations(ctx, nil, test.flags, f, test.readAheadBuffer)
if p != nil {
// This is necessary to remove the fd from the global fd notifier.
- defer p.Release()
+ defer p.Release(ctx)
} else {
// If there is no p to DecRef on, because newPipeOperations failed, then the
// file still needs to be closed.
@@ -153,13 +154,14 @@ func TestPipeDestruction(t *testing.T) {
syscall.Close(fds[1])
// Test the read end, but it doesn't really matter which.
- p, err := newPipeOperations(contexttest.Context(t), nil, fs.FileFlags{Read: true}, f, nil)
+ ctx := contexttest.Context(t)
+ p, err := newPipeOperations(ctx, nil, fs.FileFlags{Read: true}, f, nil)
if err != nil {
f.Close()
t.Fatalf("newPipeOperations got error %v, want nil", err)
}
// Drop our only reference, which should trigger the destructor.
- p.Release()
+ p.Release(ctx)
if fdnotifier.HasFD(int32(fds[0])) {
t.Fatalf("after DecRef fdnotifier has fd %d, want no longer registered", fds[0])
@@ -282,7 +284,7 @@ func TestPipeRequest(t *testing.T) {
if err != nil {
t.Fatalf("%s: newPipeOperations got error %v, want nil", test.desc, err)
}
- defer p.Release()
+ defer p.Release(ctx)
inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Pipe})
file := fs.NewFile(ctx, fs.NewDirent(ctx, inode, "pipe"), fs.FileFlags{Read: true}, p)
@@ -334,7 +336,7 @@ func TestPipeReadAheadBuffer(t *testing.T) {
rfile.Close()
t.Fatalf("newPipeOperations got error %v, want nil", err)
}
- defer p.Release()
+ defer p.Release(ctx)
inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{
Type: fs.Pipe,
@@ -380,7 +382,7 @@ func TestPipeReadsAccumulate(t *testing.T) {
}
// Don't forget to remove the fd from the fd notifier. Otherwise other tests will
// likely be borked, because it's global :(
- defer p.Release()
+ defer p.Release(ctx)
inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{
Type: fs.Pipe,
@@ -448,7 +450,7 @@ func TestPipeWritesAccumulate(t *testing.T) {
}
// Don't forget to remove the fd from the fd notifier. Otherwise other tests
// will likely be borked, because it's global :(
- defer p.Release()
+ defer p.Release(ctx)
inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{
Type: fs.Pipe,
diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go
index 846252c89..72ea70fcf 100644
--- a/pkg/sentry/fs/file.go
+++ b/pkg/sentry/fs/file.go
@@ -142,17 +142,17 @@ func NewFile(ctx context.Context, dirent *Dirent, flags FileFlags, fops FileOper
}
// DecRef destroys the File when it is no longer referenced.
-func (f *File) DecRef() {
- f.DecRefWithDestructor(func() {
+func (f *File) DecRef(ctx context.Context) {
+ f.DecRefWithDestructor(ctx, func(context.Context) {
// Drop BSD style locks.
lockRng := lock.LockRange{Start: 0, End: lock.LockEOF}
- f.Dirent.Inode.LockCtx.BSD.UnlockRegion(lock.UniqueID(f.UniqueID), lockRng)
+ f.Dirent.Inode.LockCtx.BSD.UnlockRegion(f, lockRng)
// Release resources held by the FileOperations.
- f.FileOperations.Release()
+ f.FileOperations.Release(ctx)
// Release a reference on the Dirent.
- f.Dirent.DecRef()
+ f.Dirent.DecRef(ctx)
// Only unregister if we are currently registered. There is nothing
// to register if f.async is nil (this happens when async mode is
@@ -310,7 +310,6 @@ func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error
if !f.mu.Lock(ctx) {
return 0, syserror.ErrInterrupted
}
-
unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append)
// Handle append mode.
if f.Flags().Append {
@@ -355,7 +354,6 @@ func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64
// offset."
unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append)
defer unlockAppendMu()
-
if f.Flags().Append {
if err := f.offsetForAppend(ctx, &offset); err != nil {
return 0, err
@@ -374,9 +372,10 @@ func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64
return f.FileOperations.Write(ctx, f, src, offset)
}
-// offsetForAppend sets the given offset to the end of the file.
+// offsetForAppend atomically sets the given offset to the end of the file.
//
-// Precondition: the file.Dirent.Inode.appendMu mutex should be held for writing.
+// Precondition: the file.Dirent.Inode.appendMu mutex should be held for
+// writing.
func (f *File) offsetForAppend(ctx context.Context, offset *int64) error {
uattr, err := f.Dirent.Inode.UnstableAttr(ctx)
if err != nil {
@@ -386,7 +385,7 @@ func (f *File) offsetForAppend(ctx context.Context, offset *int64) error {
}
// Update the offset.
- *offset = uattr.Size
+ atomic.StoreInt64(offset, uattr.Size)
return nil
}
@@ -461,7 +460,7 @@ func (f *File) UnstableAttr(ctx context.Context) (UnstableAttr, error) {
func (f *File) MappedName(ctx context.Context) string {
root := RootFromContext(ctx)
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(ctx)
}
name, _ := f.Dirent.FullName(root)
return name
diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go
index beba0f771..6ec721022 100644
--- a/pkg/sentry/fs/file_operations.go
+++ b/pkg/sentry/fs/file_operations.go
@@ -67,7 +67,7 @@ type SpliceOpts struct {
// - File.Flags(): This value may change during the operation.
type FileOperations interface {
// Release releases resources held by FileOperations.
- Release()
+ Release(ctx context.Context)
// Waitable defines how this File can be waited on for read and
// write readiness.
@@ -159,7 +159,9 @@ type FileOperations interface {
// io provides access to the virtual memory space to which pointers in args
// refer.
//
- // Preconditions: The AddressSpace (if any) that io refers to is activated.
+ // Preconditions:
+ // * The AddressSpace (if any) that io refers to is activated.
+ // * Must only be called from a task goroutine.
Ioctl(ctx context.Context, file *File, io usermem.IO, args arch.SyscallArguments) (uintptr, error)
}
diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go
index dcc1df38f..9dc58d5ff 100644
--- a/pkg/sentry/fs/file_overlay.go
+++ b/pkg/sentry/fs/file_overlay.go
@@ -54,7 +54,7 @@ func overlayFile(ctx context.Context, inode *Inode, flags FileFlags) (*File, err
// Drop the extra reference on the Dirent. Now there's only one reference
// on the dirent, either owned by f (if non-nil), or the Dirent is about
// to be destroyed (if GetFile failed).
- dirent.DecRef()
+ dirent.DecRef(ctx)
return f, err
}
@@ -89,12 +89,12 @@ type overlayFileOperations struct {
}
// Release implements FileOperations.Release.
-func (f *overlayFileOperations) Release() {
+func (f *overlayFileOperations) Release(ctx context.Context) {
if f.upper != nil {
- f.upper.DecRef()
+ f.upper.DecRef(ctx)
}
if f.lower != nil {
- f.lower.DecRef()
+ f.lower.DecRef(ctx)
}
}
@@ -164,7 +164,7 @@ func (f *overlayFileOperations) Seek(ctx context.Context, file *File, whence See
func (f *overlayFileOperations) Readdir(ctx context.Context, file *File, serializer DentrySerializer) (int64, error) {
root := RootFromContext(ctx)
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(ctx)
}
dirCtx := &DirCtx{
@@ -497,7 +497,7 @@ func readdirOne(ctx context.Context, d *Dirent) (map[string]DentAttr, error) {
if err != nil {
return nil, err
}
- defer dir.DecRef()
+ defer dir.DecRef(ctx)
// Use a stub serializer to read the entries into memory.
stubSerializer := &CollectEntriesSerializer{}
@@ -521,10 +521,10 @@ type overlayMappingIdentity struct {
}
// DecRef implements AtomicRefCount.DecRef.
-func (omi *overlayMappingIdentity) DecRef() {
- omi.AtomicRefCount.DecRefWithDestructor(func() {
- omi.overlayFile.DecRef()
- omi.id.DecRef()
+func (omi *overlayMappingIdentity) DecRef(ctx context.Context) {
+ omi.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) {
+ omi.overlayFile.DecRef(ctx)
+ omi.id.DecRef(ctx)
})
}
@@ -544,7 +544,7 @@ func (omi *overlayMappingIdentity) InodeID() uint64 {
func (omi *overlayMappingIdentity) MappedName(ctx context.Context) string {
root := RootFromContext(ctx)
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(ctx)
}
name, _ := omi.overlayFile.Dirent.FullName(root)
return name
diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go
index 084da2a8d..d41f30bbb 100644
--- a/pkg/sentry/fs/filesystems.go
+++ b/pkg/sentry/fs/filesystems.go
@@ -87,20 +87,6 @@ func RegisterFilesystem(f Filesystem) {
filesystems.registered[f.Name()] = f
}
-// UnregisterFilesystem removes a file system from the global set. To keep the
-// file system set compatible with save/restore, UnregisterFilesystem must be
-// called before save/restore methods.
-//
-// For instance, packages may unregister their file system after it is mounted.
-// This makes sense for pseudo file systems that should not be visible or
-// mountable. See whitelistfs in fs/host/fs.go for one example.
-func UnregisterFilesystem(name string) {
- filesystems.mu.Lock()
- defer filesystems.mu.Unlock()
-
- delete(filesystems.registered, name)
-}
-
// FindFilesystem returns a Filesystem registered at name or (nil, false) if name
// is not a file system type that can be found in /proc/filesystems.
func FindFilesystem(name string) (Filesystem, bool) {
diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go
index bdba6efe5..d2dbff268 100644
--- a/pkg/sentry/fs/fs.go
+++ b/pkg/sentry/fs/fs.go
@@ -42,9 +42,10 @@
// Dirent.dirMu
// Dirent.mu
// DirentCache.mu
-// Locks in InodeOperations implementations or overlayEntry
// Inode.Watches.mu (see `Inotify` for other lock ordering)
// MountSource.mu
+// Inode.appendMu
+// Locks in InodeOperations implementations or overlayEntry
//
// If multiple Dirent or MountSource locks must be taken, locks in the parent must be
// taken before locks in their children.
diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD
index 789369220..5fb419bcd 100644
--- a/pkg/sentry/fs/fsutil/BUILD
+++ b/pkg/sentry/fs/fsutil/BUILD
@@ -8,7 +8,6 @@ go_template_instance(
out = "dirty_set_impl.go",
imports = {
"memmap": "gvisor.dev/gvisor/pkg/sentry/memmap",
- "platform": "gvisor.dev/gvisor/pkg/sentry/platform",
},
package = "fsutil",
prefix = "Dirty",
@@ -25,14 +24,14 @@ go_template_instance(
name = "frame_ref_set_impl",
out = "frame_ref_set_impl.go",
imports = {
- "platform": "gvisor.dev/gvisor/pkg/sentry/platform",
+ "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap",
},
package = "fsutil",
prefix = "FrameRef",
template = "//pkg/segment:generic_set",
types = {
"Key": "uint64",
- "Range": "platform.FileRange",
+ "Range": "memmap.FileRange",
"Value": "uint64",
"Functions": "FrameRefSetFunctions",
},
@@ -43,7 +42,6 @@ go_template_instance(
out = "file_range_set_impl.go",
imports = {
"memmap": "gvisor.dev/gvisor/pkg/sentry/memmap",
- "platform": "gvisor.dev/gvisor/pkg/sentry/platform",
},
package = "fsutil",
prefix = "FileRange",
@@ -86,7 +84,6 @@ go_library(
"//pkg/sentry/kernel/time",
"//pkg/sentry/memmap",
"//pkg/sentry/pgalloc",
- "//pkg/sentry/platform",
"//pkg/sentry/socket/unix/transport",
"//pkg/sentry/usage",
"//pkg/state",
diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go
index c6cd45087..2c9446c1d 100644
--- a/pkg/sentry/fs/fsutil/dirty_set.go
+++ b/pkg/sentry/fs/fsutil/dirty_set.go
@@ -20,7 +20,6 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -159,7 +158,7 @@ func (ds *DirtySet) AllowClean(mr memmap.MappableRange) {
// repeatedly until all bytes have been written. max is the true size of the
// cached object; offsets beyond max will not be passed to writeAt, even if
// they are marked dirty.
-func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, dirty *DirtySet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
+func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, dirty *DirtySet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
var changedDirty bool
defer func() {
if changedDirty {
@@ -194,7 +193,7 @@ func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet
// successful partial write, SyncDirtyAll will call it repeatedly until all
// bytes have been written. max is the true size of the cached object; offsets
// beyond max will not be passed to writeAt, even if they are marked dirty.
-func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
+func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
dseg := dirty.FirstSegment()
for dseg.Ok() {
if err := syncDirtyRange(ctx, dseg.Range(), cache, max, mem, writeAt); err != nil {
@@ -210,7 +209,7 @@ func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max
}
// Preconditions: mr must be page-aligned.
-func syncDirtyRange(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
+func syncDirtyRange(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
for cseg := cache.LowerBoundSegment(mr.Start); cseg.Ok() && cseg.Start() < mr.End; cseg = cseg.NextSegment() {
wbr := cseg.Range().Intersect(mr)
if max < wbr.Start {
diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go
index 08695391c..dc9efa5df 100644
--- a/pkg/sentry/fs/fsutil/file.go
+++ b/pkg/sentry/fs/fsutil/file.go
@@ -31,7 +31,7 @@ import (
type FileNoopRelease struct{}
// Release is a no-op.
-func (FileNoopRelease) Release() {}
+func (FileNoopRelease) Release(context.Context) {}
// SeekWithDirCursor is used to implement fs.FileOperations.Seek. If dirCursor
// is not nil and the seek was on a directory, the cursor will be updated.
@@ -296,7 +296,7 @@ func (sdfo *StaticDirFileOperations) IterateDir(ctx context.Context, d *fs.Diren
func (sdfo *StaticDirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) {
root := fs.RootFromContext(ctx)
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(ctx)
}
dirCtx := &fs.DirCtx{
Serializer: serializer,
diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go
index 5643cdac9..1dc409d38 100644
--- a/pkg/sentry/fs/fsutil/file_range_set.go
+++ b/pkg/sentry/fs/fsutil/file_range_set.go
@@ -23,13 +23,12 @@ import (
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/usermem"
)
// FileRangeSet maps offsets into a memmap.Mappable to offsets into a
-// platform.File. It is used to implement Mappables that store data in
+// memmap.File. It is used to implement Mappables that store data in
// sparsely-allocated memory.
//
// type FileRangeSet <generated by go_generics>
@@ -65,33 +64,38 @@ func (FileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, spli
}
// FileRange returns the FileRange mapped by seg.
-func (seg FileRangeIterator) FileRange() platform.FileRange {
+func (seg FileRangeIterator) FileRange() memmap.FileRange {
return seg.FileRangeOf(seg.Range())
}
// FileRangeOf returns the FileRange mapped by mr.
//
-// Preconditions: seg.Range().IsSupersetOf(mr). mr.Length() != 0.
-func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) platform.FileRange {
+// Preconditions:
+// * seg.Range().IsSupersetOf(mr).
+// * mr.Length() != 0.
+func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) memmap.FileRange {
frstart := seg.Value() + (mr.Start - seg.Start())
- return platform.FileRange{frstart, frstart + mr.Length()}
+ return memmap.FileRange{frstart, frstart + mr.Length()}
}
// Fill attempts to ensure that all memmap.Mappable offsets in required are
-// mapped to a platform.File offset, by allocating from mf with the given
+// mapped to a memmap.File offset, by allocating from mf with the given
// memory usage kind and invoking readAt to store data into memory. (If readAt
// returns a successful partial read, Fill will call it repeatedly until all
// bytes have been read.) EOF is handled consistently with the requirements of
// mmap(2): bytes after EOF on the same page are zeroed; pages after EOF are
-// invalid.
+// invalid. fileSize is an upper bound on the file's size; bytes after fileSize
+// will be zeroed without calling readAt.
//
// Fill may read offsets outside of required, but will never read offsets
// outside of optional. It returns a non-nil error if any error occurs, even
// if the error only affects offsets in optional, but not in required.
//
-// Preconditions: required.Length() > 0. optional.IsSupersetOf(required).
-// required and optional must be page-aligned.
-func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.MappableRange, mf *pgalloc.MemoryFile, kind usage.MemoryKind, readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) error {
+// Preconditions:
+// * required.Length() > 0.
+// * optional.IsSupersetOf(required).
+// * required and optional must be page-aligned.
+func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.MappableRange, fileSize uint64, mf *pgalloc.MemoryFile, kind usage.MemoryKind, readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) error {
gap := frs.LowerBoundGap(required.Start)
for gap.Ok() && gap.Start() < required.End {
if gap.Range().Length() == 0 {
@@ -104,7 +108,21 @@ func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.Map
fr, err := mf.AllocateAndFill(gr.Length(), kind, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) {
var done uint64
for !dsts.IsEmpty() {
- n, err := readAt(ctx, dsts, gr.Start+done)
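+ // Clamp reads to fileSize: offsets at or beyond it see EOF, and a
+ // read straddling it is truncated so the trailing bytes are
+ // zero-filled rather than read from the backing file.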
+ n, err := func() (uint64, error) {
+ off := gr.Start + done
+ if off >= fileSize {
+ return 0, io.EOF
+ }
+ if off+dsts.NumBytes() > fileSize {
+ rd := fileSize - off
+ n, err := readAt(ctx, dsts.TakeFirst64(rd), off)
+ if n == rd && err == nil {
+ return n, io.EOF
+ }
+ return n, err
+ }
+ return readAt(ctx, dsts, off)
+ }()
done += n
dsts = dsts.DropFirst64(n)
if err != nil {
@@ -141,7 +159,7 @@ func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.Map
}
// Drop removes segments for memmap.Mappable offsets in mr, freeing the
-// corresponding platform.FileRanges.
+// corresponding memmap.FileRanges.
//
// Preconditions: mr must be page-aligned.
func (frs *FileRangeSet) Drop(mr memmap.MappableRange, mf *pgalloc.MemoryFile) {
@@ -154,7 +172,7 @@ func (frs *FileRangeSet) Drop(mr memmap.MappableRange, mf *pgalloc.MemoryFile) {
}
// DropAll removes all segments in mr, freeing the corresponding
-// platform.FileRanges.
+// memmap.FileRanges.
func (frs *FileRangeSet) DropAll(mf *pgalloc.MemoryFile) {
for seg := frs.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
mf.DecRef(seg.FileRange())
diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go
index dd6f5aba6..a808894df 100644
--- a/pkg/sentry/fs/fsutil/frame_ref_set.go
+++ b/pkg/sentry/fs/fsutil/frame_ref_set.go
@@ -17,7 +17,7 @@ package fsutil
import (
"math"
- "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/usage"
)
@@ -39,7 +39,7 @@ func (FrameRefSetFunctions) ClearValue(val *uint64) {
}
// Merge implements segment.Functions.Merge.
-func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.FileRange, val2 uint64) (uint64, bool) {
+func (FrameRefSetFunctions) Merge(_ memmap.FileRange, val1 uint64, _ memmap.FileRange, val2 uint64) (uint64, bool) {
if val1 != val2 {
return 0, false
}
@@ -47,13 +47,13 @@ func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.
}
// Split implements segment.Functions.Split.
-func (FrameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) {
+func (FrameRefSetFunctions) Split(_ memmap.FileRange, val uint64, _ uint64) (uint64, uint64) {
return val, val
}
// IncRefAndAccount adds a reference on the range fr. All newly inserted segments
// are accounted as host page cache memory mappings.
-func (refs *FrameRefSet) IncRefAndAccount(fr platform.FileRange) {
+func (refs *FrameRefSet) IncRefAndAccount(fr memmap.FileRange) {
seg, gap := refs.Find(fr.Start)
for {
switch {
@@ -74,7 +74,7 @@ func (refs *FrameRefSet) IncRefAndAccount(fr platform.FileRange) {
// DecRefAndAccount removes a reference on the range fr and untracks segments
// that are removed from memory accounting.
-func (refs *FrameRefSet) DecRefAndAccount(fr platform.FileRange) {
+func (refs *FrameRefSet) DecRefAndAccount(fr memmap.FileRange) {
seg := refs.FindSegment(fr.Start)
for seg.Ok() && seg.Start() < fr.End {
diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go
index e82afd112..4468f5dd2 100644
--- a/pkg/sentry/fs/fsutil/host_file_mapper.go
+++ b/pkg/sentry/fs/fsutil/host_file_mapper.go
@@ -21,7 +21,6 @@ import (
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -71,6 +70,13 @@ func (f *HostFileMapper) Init() {
f.mappings = make(map[uint64]mapping)
}
+// IsInited returns true if f.Init() has been called. This is used when
+// restoring a checkpoint that contains a HostFileMapper that may or may not
+// have been initialized.
+func (f *HostFileMapper) IsInited() bool {
+ return f.refs != nil
+}
+
// NewHostFileMapper returns an initialized HostFileMapper allocated on the
// heap with no references or cached mappings.
func NewHostFileMapper() *HostFileMapper {
@@ -81,7 +87,9 @@ func NewHostFileMapper() *HostFileMapper {
// IncRefOn increments the reference count on all offsets in mr.
//
-// Preconditions: mr.Length() != 0. mr.Start and mr.End must be page-aligned.
+// Preconditions:
+// * mr.Length() != 0.
+// * mr.Start and mr.End must be page-aligned.
func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) {
f.refsMu.Lock()
defer f.refsMu.Unlock()
@@ -98,7 +106,9 @@ func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) {
// DecRefOn decrements the reference count on all offsets in mr.
//
-// Preconditions: mr.Length() != 0. mr.Start and mr.End must be page-aligned.
+// Preconditions:
+// * mr.Length() != 0.
+// * mr.Start and mr.End must be page-aligned.
func (f *HostFileMapper) DecRefOn(mr memmap.MappableRange) {
f.refsMu.Lock()
defer f.refsMu.Unlock()
@@ -126,7 +136,7 @@ func (f *HostFileMapper) DecRefOn(mr memmap.MappableRange) {
// offsets in fr or until the next call to UnmapAll.
//
// Preconditions: The caller must hold a reference on all offsets in fr.
-func (f *HostFileMapper) MapInternal(fr platform.FileRange, fd int, write bool) (safemem.BlockSeq, error) {
+func (f *HostFileMapper) MapInternal(fr memmap.FileRange, fd int, write bool) (safemem.BlockSeq, error) {
chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift)
f.mapsMu.Lock()
defer f.mapsMu.Unlock()
@@ -146,7 +156,7 @@ func (f *HostFileMapper) MapInternal(fr platform.FileRange, fd int, write bool)
}
// Preconditions: f.mapsMu must be locked.
-func (f *HostFileMapper) forEachMappingBlockLocked(fr platform.FileRange, fd int, write bool, fn func(safemem.Block)) error {
+func (f *HostFileMapper) forEachMappingBlockLocked(fr memmap.FileRange, fd int, write bool, fn func(safemem.Block)) error {
prot := syscall.PROT_READ
if write {
prot |= syscall.PROT_WRITE
@@ -205,7 +215,9 @@ func (f *HostFileMapper) UnmapAll() {
}
}
-// Preconditions: f.mapsMu must be locked. f.mappings[chunkStart] == m.
+// Preconditions:
+// * f.mapsMu must be locked.
+// * f.mappings[chunkStart] == m.
func (f *HostFileMapper) unmapAndRemoveLocked(chunkStart uint64, m mapping) {
if _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m.addr, chunkSize, 0); errno != 0 {
// This leaks address space and is unexpected, but is otherwise
diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go
index 78fec553e..c15d8a946 100644
--- a/pkg/sentry/fs/fsutil/host_mappable.go
+++ b/pkg/sentry/fs/fsutil/host_mappable.go
@@ -21,18 +21,17 @@ import (
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/usermem"
)
-// HostMappable implements memmap.Mappable and platform.File over a
+// HostMappable implements memmap.Mappable and memmap.File over a
// CachedFileObject.
//
// Lock order (compare the lock order model in mm/mm.go):
// truncateMu ("fs locks")
// mu ("memmap.Mappable locks not taken by Translate")
-// ("platform.File locks")
+// ("memmap.File locks")
// backingFile ("CachedFileObject locks")
//
// +stateify savable
@@ -124,24 +123,24 @@ func (h *HostMappable) NotifyChangeFD() error {
return nil
}
-// MapInternal implements platform.File.MapInternal.
-func (h *HostMappable) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+// MapInternal implements memmap.File.MapInternal.
+func (h *HostMappable) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
return h.hostFileMapper.MapInternal(fr, h.backingFile.FD(), at.Write)
}
-// FD implements platform.File.FD.
+// FD implements memmap.File.FD.
func (h *HostMappable) FD() int {
return h.backingFile.FD()
}
-// IncRef implements platform.File.IncRef.
-func (h *HostMappable) IncRef(fr platform.FileRange) {
+// IncRef implements memmap.File.IncRef.
+func (h *HostMappable) IncRef(fr memmap.FileRange) {
mr := memmap.MappableRange{Start: fr.Start, End: fr.End}
h.hostFileMapper.IncRefOn(mr)
}
-// DecRef implements platform.File.DecRef.
-func (h *HostMappable) DecRef(fr platform.FileRange) {
+// DecRef implements memmap.File.DecRef.
+func (h *HostMappable) DecRef(fr memmap.FileRange) {
mr := memmap.MappableRange{Start: fr.Start, End: fr.End}
h.hostFileMapper.DecRefOn(mr)
}
diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go
index 800c8b4e1..82eda3e43 100644
--- a/pkg/sentry/fs/fsutil/inode_cached.go
+++ b/pkg/sentry/fs/fsutil/inode_cached.go
@@ -22,11 +22,9 @@ import (
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/fs"
- "gvisor.dev/gvisor/pkg/sentry/kernel/time"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/usermem"
@@ -445,7 +443,7 @@ func (c *CachingInodeOperations) TouchAccessTime(ctx context.Context, inode *fs.
// time.
//
// Preconditions: c.attrMu is locked for writing.
-func (c *CachingInodeOperations) touchAccessTimeLocked(now time.Time) {
+func (c *CachingInodeOperations) touchAccessTimeLocked(now ktime.Time) {
c.attr.AccessTime = now
c.dirtyAttr.AccessTime = true
}
@@ -462,7 +460,7 @@ func (c *CachingInodeOperations) TouchModificationAndStatusChangeTime(ctx contex
// and status change times in-place to the current time.
//
// Preconditions: c.attrMu is locked for writing.
-func (c *CachingInodeOperations) touchModificationAndStatusChangeTimeLocked(now time.Time) {
+func (c *CachingInodeOperations) touchModificationAndStatusChangeTimeLocked(now ktime.Time) {
c.attr.ModificationTime = now
c.dirtyAttr.ModificationTime = true
c.attr.StatusChangeTime = now
@@ -481,7 +479,7 @@ func (c *CachingInodeOperations) TouchStatusChangeTime(ctx context.Context) {
// in-place to the current time.
//
// Preconditions: c.attrMu is locked for writing.
-func (c *CachingInodeOperations) touchStatusChangeTimeLocked(now time.Time) {
+func (c *CachingInodeOperations) touchStatusChangeTimeLocked(now ktime.Time) {
c.attr.StatusChangeTime = now
c.dirtyAttr.StatusChangeTime = true
}
@@ -646,7 +644,7 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
End: fs.OffsetPageEnd(int64(gapMR.End)),
}
optMR := gap.Range()
- err := rw.c.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mem, usage.PageCache, rw.c.backingFile.ReadToBlocksAt)
+ err := rw.c.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), uint64(rw.c.attr.Size), mem, usage.PageCache, rw.c.backingFile.ReadToBlocksAt)
mem.MarkEvictable(rw.c, pgalloc.EvictableRange{optMR.Start, optMR.End})
seg, gap = rw.c.cache.Find(uint64(rw.offset))
if !seg.Ok() {
@@ -673,9 +671,6 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
// Continue.
seg, gap = gap.NextSegment(), FileRangeGapIterator{}
}
-
- default:
- break
}
}
unlock()
@@ -685,7 +680,9 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
// maybeGrowFile grows the file's size if data has been written past the old
// size.
//
-// Preconditions: rw.c.attrMu and rw.c.dataMu bust be locked.
+// Preconditions:
+// * rw.c.attrMu must be locked.
+// * rw.c.dataMu must be locked.
func (rw *inodeReadWriter) maybeGrowFile() {
// If the write ends beyond the file's previous size, it causes the
// file to grow.
@@ -767,9 +764,6 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error
// Continue.
seg, gap = gap.NextSegment(), FileRangeGapIterator{}
-
- default:
- break
}
}
rw.maybeGrowFile()
@@ -876,7 +870,7 @@ func (c *CachingInodeOperations) Translate(ctx context.Context, required, option
}
mf := c.mfp.MemoryFile()
- cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, c.backingFile.ReadToBlocksAt)
+ cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), uint64(c.attr.Size), mf, usage.PageCache, c.backingFile.ReadToBlocksAt)
var ts []memmap.Translation
var translatedEnd uint64
@@ -934,7 +928,7 @@ func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange
// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error {
- // Whether we have a host fd (and consequently what platform.File is
+ // Whether we have a host fd (and consequently what memmap.File is
// mapped) can change across save/restore, so invalidate all translations
// unconditionally.
c.mapsMu.Lock()
@@ -999,10 +993,10 @@ func (c *CachingInodeOperations) Evict(ctx context.Context, er pgalloc.Evictable
}
}
-// IncRef implements platform.File.IncRef. This is used when we directly map an
-// underlying host fd and CachingInodeOperations is used as the platform.File
+// IncRef implements memmap.File.IncRef. This is used when we directly map an
+// underlying host fd and CachingInodeOperations is used as the memmap.File
// during translation.
-func (c *CachingInodeOperations) IncRef(fr platform.FileRange) {
+func (c *CachingInodeOperations) IncRef(fr memmap.FileRange) {
// Hot path. Avoid defers.
c.dataMu.Lock()
seg, gap := c.refs.Find(fr.Start)
@@ -1024,10 +1018,10 @@ func (c *CachingInodeOperations) IncRef(fr platform.FileRange) {
}
}
-// DecRef implements platform.File.DecRef. This is used when we directly map an
-// underlying host fd and CachingInodeOperations is used as the platform.File
+// DecRef implements memmap.File.DecRef. This is used when we directly map an
+// underlying host fd and CachingInodeOperations is used as the memmap.File
// during translation.
-func (c *CachingInodeOperations) DecRef(fr platform.FileRange) {
+func (c *CachingInodeOperations) DecRef(fr memmap.FileRange) {
// Hot path. Avoid defers.
c.dataMu.Lock()
seg := c.refs.FindSegment(fr.Start)
@@ -1046,15 +1040,15 @@ func (c *CachingInodeOperations) DecRef(fr platform.FileRange) {
c.dataMu.Unlock()
}
-// MapInternal implements platform.File.MapInternal. This is used when we
+// MapInternal implements memmap.File.MapInternal. This is used when we
// directly map an underlying host fd and CachingInodeOperations is used as the
-// platform.File during translation.
-func (c *CachingInodeOperations) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+// memmap.File during translation.
+func (c *CachingInodeOperations) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
return c.hostFileMapper.MapInternal(fr, c.backingFile.FD(), at.Write)
}
-// FD implements platform.File.FD. This is used when we directly map an
-// underlying host fd and CachingInodeOperations is used as the platform.File
+// FD implements memmap.File.FD. This is used when we directly map an
+// underlying host fd and CachingInodeOperations is used as the memmap.File
// during translation.
func (c *CachingInodeOperations) FD() int {
return c.backingFile.FD()
diff --git a/pkg/sentry/fs/g3doc/fuse.md b/pkg/sentry/fs/g3doc/fuse.md
index b43c082a7..05e043583 100644
--- a/pkg/sentry/fs/g3doc/fuse.md
+++ b/pkg/sentry/fs/g3doc/fuse.md
@@ -76,9 +76,10 @@ ops can be implemented in parallel.
#### Minimal client that can mount a trivial FUSE filesystem.
-- Implement `/dev/fuse`.
+- Implement `/dev/fuse` - a character device used to establish an FD for
+ communication between the sentry and the server daemon.
-- Implement basic FUSE ops like `FUSE_INIT`, `FUSE_DESTROY`.
+- Implement basic FUSE ops like `FUSE_INIT`.
#### Read-only mount with basic file operations
@@ -94,6 +95,103 @@ ops can be implemented in parallel.
- Implement the remaining FUSE ops and decide if we can omit rarely used
operations like ioctl.
+### Design Details
+
+#### Lifecycle for a FUSE Request
+
+- User invokes a syscall.
+- Sentry prepares the corresponding request.
+ - If the FUSE device is available, the request is written in binary form.
+ - If the FUSE device is full, the kernel task blocks until it becomes
+ available.
+- Sentry notifies the readers of the FUSE device that it is ready for read.
+- The FUSE daemon reads the request and processes it.
+- Sentry waits until a reply is written to the FUSE device
+ - but returns directly for async requests.
+- The FUSE daemon writes to the FUSE device.
+- Sentry processes the reply.
+ - For sync requests, it unblocks the blocked kernel task.
+ - For async requests, it executes the pre-specified callback, if any.
+- Sentry returns the syscall to the user (a minimal sketch of the synchronous
+ path follows below).
+
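+The sketch below is illustrative only, with hypothetical names; the real
+request plumbing lives in `pkg/sentry/fsimpl/fuse`:
+
+```go
+// Illustrative only: how a task-side synchronous FUSE request can block
+// until the daemon replies. All names here are hypothetical.
+type request struct {
+    payload []byte      // marshalled FUSE header + opcode-specific body
+    done    chan []byte // receives the daemon's reply
+}
+
+type conn struct {
+    queue chan *request // requests not yet read by the FUSE daemon
+}
+
+// call enqueues a request and blocks the calling task until the daemon
+// writes a reply to the device, which is delivered on req.done.
+func (c *conn) call(req *request) []byte {
+    c.queue <- req // blocks while the device queue is full
+    return <-req.done
+}
+```
+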
+#### Channels and Queues for Requests in Different Stages
+
+`connection.initializedChan`
+
+- a channel that requests issued before connection initialization block
+ on.
+
+`fd.queue`
+
+- a queue of requests that haven’t been read by the FUSE daemon yet.
+
+`fd.completions`
+
+- a map of the requests that have been prepared but have not yet received a
+ response, including the ones on the `fd.queue`.
+
+`fd.waitQueue`
+
+- a queue of waiters, such as the FUSE daemon, that are waiting for the FUSE
+ device fd to become available.
+
+`fd.fullQueueCh`
+
+- a channel that the kernel task will be blocked on when the fd is not
+ available.
+
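+Putting these together, a simplified picture of the device state might look
+like the sketch below (field names mirror the list above; the types are
+illustrative, not the real ones):
+
+```go
+// Illustrative only: approximate shapes of the structures listed above.
+type request struct{ id uint64 }
+type futureResponse struct{ data chan []byte }
+
+type connection struct {
+    // Closed once the connection is initialized; requests issued
+    // before that block on it.
+    initializedChan chan struct{}
+}
+
+type deviceFD struct {
+    queue       []*request                 // not yet read by the daemon
+    completions map[uint64]*futureResponse // prepared but not yet answered
+    fullQueueCh chan struct{}              // kernel tasks block here when full
+    // fd.waitQueue (waiters such as the FUSE daemon) is omitted here.
+}
+```
+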
+#### Basic I/O Implementation
+
+Currently we have implemented basic read and write functionality for our
+FUSE. We describe the design and ways to improve it here:
+
+##### Basic FUSE Read
+
+The vfs2 expects implementations of `vfs.FileDescriptionImpl.Read()` and
+`vfs.FileDescriptionImpl.PRead()`. When a syscall is made, it will eventually
+reach our implementation of those interface functions located at
+`pkg/sentry/fsimpl/fuse/regular_file.go` for regular files.
+
+After validating the input, the sentry sends `FUSE_READ` requests to the
+FUSE daemon. The FUSE daemon returns the data following the `fuse_out_header`
+as the response. For the first version, we create a copy of that data in
+kernel memory, represented as a byte slice in the marshalled struct. This
+currently happens as a common step for all FUSE responses, at
+`pkg/sentry/fsimpl/fuse/dev.go:writeLocked()`. We then copy directly from this
+intermediate buffer to the user buffer provided by the read syscall.
+
+There is an extra requirement for FUSE: when mounting the FUSE fs, the mounter
+or the FUSE daemon can specify a `max_read` or a `max_pages` parameter. These
+are upper bounds on the number of bytes to read in each `FUSE_READ` request. We
+implemented code to handle such fragmented reads, as sketched below.
+
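+A rough illustration of the fragmentation logic, using a hypothetical helper
+rather than the actual code:
+
+```go
+// readFragmented issues FUSE_READ requests of at most maxRead bytes each
+// until size bytes have been read or the daemon returns a short read (EOF).
+func readFragmented(readOnce func(off uint64, n uint32) ([]byte, error),
+    off uint64, size, maxRead uint32) ([]byte, error) {
+    buf := make([]byte, 0, size)
+    for read := uint32(0); read < size; {
+        n := size - read
+        if n > maxRead {
+            n = maxRead
+        }
+        data, err := readOnce(off+uint64(read), n)
+        buf = append(buf, data...)
+        read += uint32(len(data))
+        if err != nil {
+            return buf, err
+        }
+        if uint32(len(data)) < n { // short read: EOF
+            break
+        }
+    }
+    return buf, nil
+}
+```
+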
+To improve performance: ideally we should have a buffer cache into which to
+copy the data from the FUSE daemon's responses, as several other existing file
+system implementations for the sentry do, instead of a single-use temporary
+buffer. Directly mapping the memory of one process into another could also
+boost performance, but we chose not to do so in order to keep the processes
+isolated.
+
+##### Basic FUSE Write
+
+The vfs2 invokes implementations of `vfs.FileDescriptionImpl.Write()` and
+`vfs.FileDescriptionImpl.PWrite()` on the regular file descriptor of FUSE when a
+user makes a write(2) or pwrite(2) syscall.
+
+For valid writes, the sentry sends the bytes to write after a `FUSE_WRITE`
+header (which can be regarded as a request with two payloads) to the FUSE
+daemon. For the first version, we allocate a buffer inside kernel memory to
+store the bytes from the user and copy directly from that buffer to the memory
+of the FUSE daemon. This happens at `pkg/sentry/fsimpl/fuse/dev.go:readLocked()`.
+
+The parameters `max_write` and `max_pages` restrict the number of bytes in one
+`FUSE_WRITE`. The current implementation contains code to handle fragmented
+writes, as sketched below.
+
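+As with reads, a hedged sketch of the write-side fragmentation (hypothetical
+helper; each message is the headers followed by one payload chunk):
+
+```go
+// writeFragmented sends data in FUSE_WRITE messages of at most maxWrite
+// bytes each, stopping early on an error or a short write.
+func writeFragmented(writeOnce func(off uint64, chunk []byte) (uint32, error),
+    off uint64, data []byte, maxWrite uint32) (uint32, error) {
+    var written uint32
+    for len(data) > 0 {
+        n := uint32(len(data))
+        if n > maxWrite {
+            n = maxWrite
+        }
+        done, err := writeOnce(off+uint64(written), data[:n])
+        written += done
+        if err != nil || done < n { // error or short write: stop early
+            return written, err
+        }
+        data = data[n:]
+    }
+    return written, nil
+}
+```
+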
+For better performance, the extra copy created to store the bytes to write
+could be replaced by the buffer cache as well.
+
# Appendix
## FUSE Protocol
diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go
index b2fcab127..c0bc63a32 100644
--- a/pkg/sentry/fs/gofer/file.go
+++ b/pkg/sentry/fs/gofer/file.go
@@ -114,7 +114,7 @@ func NewFile(ctx context.Context, dirent *fs.Dirent, name string, flags fs.FileF
}
// Release implements fs.FileOperations.Release.
-func (f *fileOperations) Release() {
+func (f *fileOperations) Release(context.Context) {
f.handles.DecRef()
}
@@ -122,7 +122,7 @@ func (f *fileOperations) Release() {
func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) {
root := fs.RootFromContext(ctx)
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(ctx)
}
dirCtx := &fs.DirCtx{
diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go
index 2df2fe889..326fed954 100644
--- a/pkg/sentry/fs/gofer/gofer_test.go
+++ b/pkg/sentry/fs/gofer/gofer_test.go
@@ -232,7 +232,7 @@ func TestRevalidation(t *testing.T) {
// We must release the dirent, or the test will fail
// with a reference leak. This is tracked by p9test.
- defer dirent.DecRef()
+ defer dirent.DecRef(ctx)
// Walk again. Depending on the cache policy, we may
// get a new dirent.
@@ -246,7 +246,7 @@ func TestRevalidation(t *testing.T) {
if !test.preModificationWantReload && dirent != newDirent {
t.Errorf("Lookup with cachePolicy=%s got new dirent %+v, wanted old dirent %+v", test.cachePolicy, newDirent, dirent)
}
- newDirent.DecRef() // See above.
+ newDirent.DecRef(ctx) // See above.
// Modify the underlying mocked file's modification
// time for the next walk that occurs.
@@ -287,7 +287,7 @@ func TestRevalidation(t *testing.T) {
if test.postModificationWantUpdatedAttrs && gotModTimeSeconds != nowSeconds {
t.Fatalf("Lookup with cachePolicy=%s got new modification time %v, wanted %v", test.cachePolicy, gotModTimeSeconds, nowSeconds)
}
- newDirent.DecRef() // See above.
+ newDirent.DecRef(ctx) // See above.
// Remove the file from the remote fs, subsequent walks
// should now fail to find anything.
@@ -303,7 +303,7 @@ func TestRevalidation(t *testing.T) {
t.Errorf("Lookup with cachePolicy=%s got new dirent and error %v, wanted old dirent and nil error", test.cachePolicy, err)
}
if err == nil {
- newDirent.DecRef() // See above.
+ newDirent.DecRef(ctx) // See above.
}
})
}
diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go
index fc14249be..f324dbf26 100644
--- a/pkg/sentry/fs/gofer/handles.go
+++ b/pkg/sentry/fs/gofer/handles.go
@@ -47,7 +47,8 @@ type handles struct {
// DecRef drops a reference on handles.
func (h *handles) DecRef() {
- h.DecRefWithDestructor(func() {
+ ctx := context.Background()
+ h.DecRefWithDestructor(ctx, func(context.Context) {
if h.Host != nil {
if h.isHostBorrowed {
h.Host.Release()
@@ -57,7 +58,7 @@ func (h *handles) DecRef() {
}
}
}
- if err := h.File.close(context.Background()); err != nil {
+ if err := h.File.close(ctx); err != nil {
log.Warningf("error closing p9 file: %v", err)
}
})
diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go
index a016c896e..3a225fd39 100644
--- a/pkg/sentry/fs/gofer/inode.go
+++ b/pkg/sentry/fs/gofer/inode.go
@@ -441,8 +441,9 @@ func (i *inodeOperations) Release(ctx context.Context) {
// asynchronously.
//
// We use AsyncWithContext to avoid needing to allocate an extra
- // anonymous function on the heap.
- fs.AsyncWithContext(ctx, i.fileState.Release)
+ // anonymous function on the heap. We must use the background context
+ // because the async work cannot happen on the task context.
+ fs.AsyncWithContext(context.Background(), i.fileState.Release)
}
// Mappable implements fs.InodeOperations.Mappable.
@@ -640,7 +641,7 @@ func (i *inodeOperations) Allocate(ctx context.Context, inode *fs.Inode, offset,
// WriteOut implements fs.InodeOperations.WriteOut.
func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error {
- if !i.session().cachePolicy.cacheUAttrs(inode) {
+ if inode.MountSource.Flags.ReadOnly || !i.session().cachePolicy.cacheUAttrs(inode) {
return nil
}
diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go
index cf9800100..3c66dc3c2 100644
--- a/pkg/sentry/fs/gofer/path.go
+++ b/pkg/sentry/fs/gofer/path.go
@@ -168,7 +168,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string
// Construct the positive Dirent.
d := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name)
- defer d.DecRef()
+ defer d.DecRef(ctx)
// Construct the new file, caching the handles if allowed.
h := handles{
@@ -371,7 +371,7 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string
// Find out if file being deleted is a socket or pipe that needs to be
// removed from endpoint map.
if d, err := i.Lookup(ctx, dir, name); err == nil {
- defer d.DecRef()
+ defer d.DecRef(ctx)
if fs.IsSocket(d.Inode.StableAttr) || fs.IsPipe(d.Inode.StableAttr) {
switch iops := d.Inode.InodeOperations.(type) {
@@ -392,7 +392,7 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string
return err
}
if key != nil {
- i.session().overrides.remove(*key)
+ i.session().overrides.remove(ctx, *key)
}
i.touchModificationAndStatusChangeTime(ctx, dir)
diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go
index b5efc86f2..7cf3522ff 100644
--- a/pkg/sentry/fs/gofer/session.go
+++ b/pkg/sentry/fs/gofer/session.go
@@ -89,10 +89,10 @@ func (e *overrideMaps) addPipe(key device.MultiDeviceKey, d *fs.Dirent, inode *f
// remove deletes the key from the maps.
//
// Precondition: maps must have been locked with 'lock'.
-func (e *overrideMaps) remove(key device.MultiDeviceKey) {
+func (e *overrideMaps) remove(ctx context.Context, key device.MultiDeviceKey) {
endpoint := e.keyMap[key]
delete(e.keyMap, key)
- endpoint.dirent.DecRef()
+ endpoint.dirent.DecRef(ctx)
}
// lock blocks other addition and removal operations from happening while
@@ -197,7 +197,7 @@ type session struct {
}
// Destroy tears down the session.
-func (s *session) Destroy() {
+func (s *session) Destroy(ctx context.Context) {
s.client.Close()
}
@@ -329,7 +329,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF
s.client, err = p9.NewClient(conn, s.msize, s.version)
if err != nil {
// Drop our reference on the session, it needs to be torn down.
- s.DecRef()
+ s.DecRef(ctx)
return nil, err
}
@@ -340,7 +340,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF
ctx.UninterruptibleSleepFinish(false)
if err != nil {
// Same as above.
- s.DecRef()
+ s.DecRef(ctx)
return nil, err
}
@@ -348,7 +348,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF
if err != nil {
s.attach.close(ctx)
// Same as above, but after we execute the Close request.
- s.DecRef()
+ s.DecRef(ctx)
return nil, err
}
@@ -393,13 +393,13 @@ func (s *session) fillKeyMap(ctx context.Context) error {
// fillPathMap populates paths for overrides from dirents in direntMap
// before save.
-func (s *session) fillPathMap() error {
+func (s *session) fillPathMap(ctx context.Context) error {
unlock := s.overrides.lock()
defer unlock()
for _, endpoint := range s.overrides.keyMap {
mountRoot := endpoint.dirent.MountRoot()
- defer mountRoot.DecRef()
+ defer mountRoot.DecRef(ctx)
dirPath, _ := endpoint.dirent.FullName(mountRoot)
if dirPath == "" {
return fmt.Errorf("error getting path from dirent")
diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go
index 2d398b753..48b423dd8 100644
--- a/pkg/sentry/fs/gofer/session_state.go
+++ b/pkg/sentry/fs/gofer/session_state.go
@@ -26,7 +26,8 @@ import (
// beforeSave is invoked by stateify.
func (s *session) beforeSave() {
if s.overrides != nil {
- if err := s.fillPathMap(); err != nil {
+ ctx := &dummyClockContext{context.Background()}
+ if err := s.fillPathMap(ctx); err != nil {
panic("failed to save paths to override map before saving" + err.Error())
}
}
diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go
index 40f2c1cad..8a1c69ac2 100644
--- a/pkg/sentry/fs/gofer/socket.go
+++ b/pkg/sentry/fs/gofer/socket.go
@@ -134,14 +134,14 @@ func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.Connect
// We don't need the receiver.
c.CloseRecv()
- c.Release()
+ c.Release(ctx)
return c, nil
}
// Release implements transport.BoundEndpoint.Release.
-func (e *endpoint) Release() {
- e.inode.DecRef()
+func (e *endpoint) Release(ctx context.Context) {
+ e.inode.DecRef(ctx)
}
// Passcred implements transport.BoundEndpoint.Passcred.
diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD
index aabce6cc9..1368014c4 100644
--- a/pkg/sentry/fs/host/BUILD
+++ b/pkg/sentry/fs/host/BUILD
@@ -30,7 +30,9 @@ go_library(
"//pkg/context",
"//pkg/fd",
"//pkg/fdnotifier",
+ "//pkg/iovec",
"//pkg/log",
+ "//pkg/marshal/primitive",
"//pkg/refs",
"//pkg/safemem",
"//pkg/secio",
diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go
index 39299b7e4..0d8d36afa 100644
--- a/pkg/sentry/fs/host/control.go
+++ b/pkg/sentry/fs/host/control.go
@@ -57,7 +57,7 @@ func (c *scmRights) Clone() transport.RightsControlMessage {
}
// Release implements transport.RightsControlMessage.Release.
-func (c *scmRights) Release() {
+func (c *scmRights) Release(ctx context.Context) {
for _, fd := range c.fds {
syscall.Close(fd)
}
diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go
index 3e48b8b2c..86d1a87f0 100644
--- a/pkg/sentry/fs/host/file.go
+++ b/pkg/sentry/fs/host/file.go
@@ -110,7 +110,7 @@ func newFileFromDonatedFD(ctx context.Context, donated int, saveable, isTTY bool
name := fmt.Sprintf("host:[%d]", inode.StableAttr.InodeID)
dirent := fs.NewDirent(ctx, inode, name)
- defer dirent.DecRef()
+ defer dirent.DecRef(ctx)
if isTTY {
return newTTYFile(ctx, dirent, flags, iops), nil
@@ -169,7 +169,7 @@ func (f *fileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) {
root := fs.RootFromContext(ctx)
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(ctx)
}
dirCtx := &fs.DirCtx{
Serializer: serializer,
diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go
index 62f1246aa..fbfba1b58 100644
--- a/pkg/sentry/fs/host/inode.go
+++ b/pkg/sentry/fs/host/inode.go
@@ -368,6 +368,9 @@ func (i *inodeOperations) Allocate(ctx context.Context, inode *fs.Inode, offset,
// WriteOut implements fs.InodeOperations.WriteOut.
func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error {
+ if inode.MountSource.Flags.ReadOnly {
+ return nil
+ }
// Have we been using host kernel metadata caches?
if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) {
// Then the metadata is already up to date on the host.
diff --git a/pkg/sentry/fs/host/inode_test.go b/pkg/sentry/fs/host/inode_test.go
index c507f57eb..41a23b5da 100644
--- a/pkg/sentry/fs/host/inode_test.go
+++ b/pkg/sentry/fs/host/inode_test.go
@@ -36,7 +36,7 @@ func TestCloseFD(t *testing.T) {
if err != nil {
t.Fatalf("Failed to create File: %v", err)
}
- file.DecRef()
+ file.DecRef(ctx)
s := make([]byte, 10)
if c, err := syscall.Read(p[0], s); c != 0 || err != nil {
diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go
index b6e94583e..a2f3d5918 100644
--- a/pkg/sentry/fs/host/socket.go
+++ b/pkg/sentry/fs/host/socket.go
@@ -22,7 +22,6 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/fdnotifier"
- "gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/socket/control"
@@ -39,11 +38,6 @@ import (
// LINT.IfChange
-// maxSendBufferSize is the maximum host send buffer size allowed for endpoint.
-//
-// N.B. 8MB is the default maximum on Linux (2 * sysctl_wmem_max).
-const maxSendBufferSize = 8 << 20
-
// ConnectedEndpoint is a host FD backed implementation of
// transport.ConnectedEndpoint and transport.Receiver.
//
@@ -103,10 +97,6 @@ func (c *ConnectedEndpoint) init() *syserr.Error {
if err != nil {
return syserr.FromError(err)
}
- if sndbuf > maxSendBufferSize {
- log.Warningf("Socket send buffer too large: %d", sndbuf)
- return syserr.ErrInvalidEndpointState
- }
c.stype = linux.SockType(stype)
c.sndbuf = int64(sndbuf)
@@ -204,7 +194,7 @@ func newSocket(ctx context.Context, orgfd int, saveable bool) (*fs.File, error)
}
// Send implements transport.ConnectedEndpoint.Send.
-func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) {
+func (c *ConnectedEndpoint) Send(ctx context.Context, data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) {
c.mu.RLock()
defer c.mu.RUnlock()
@@ -281,7 +271,7 @@ func (c *ConnectedEndpoint) EventUpdate() {
}
// Recv implements transport.Receiver.Recv.
-func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
+func (c *ConnectedEndpoint) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
c.mu.RLock()
defer c.mu.RUnlock()
@@ -328,7 +318,7 @@ func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights int, peek
}
// close releases all resources related to the endpoint.
-func (c *ConnectedEndpoint) close() {
+func (c *ConnectedEndpoint) close(context.Context) {
fdnotifier.RemoveFD(int32(c.file.FD()))
c.file.Close()
c.file = nil
@@ -384,8 +374,8 @@ func (c *ConnectedEndpoint) RecvMaxQueueSize() int64 {
}
// Release implements transport.ConnectedEndpoint.Release and transport.Receiver.Release.
-func (c *ConnectedEndpoint) Release() {
- c.ref.DecRefWithDestructor(c.close)
+func (c *ConnectedEndpoint) Release(ctx context.Context) {
+ c.ref.DecRefWithDestructor(ctx, c.close)
}
// CloseUnread implements transport.ConnectedEndpoint.CloseUnread.
diff --git a/pkg/sentry/fs/host/socket_iovec.go b/pkg/sentry/fs/host/socket_iovec.go
index 5c18dbd5e..905afb50d 100644
--- a/pkg/sentry/fs/host/socket_iovec.go
+++ b/pkg/sentry/fs/host/socket_iovec.go
@@ -17,15 +17,12 @@ package host
import (
"syscall"
- "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/iovec"
"gvisor.dev/gvisor/pkg/syserror"
)
// LINT.IfChange
-// maxIovs is the maximum number of iovecs to pass to the host.
-var maxIovs = linux.UIO_MAXIOV
-
// copyToMulti copies as many bytes from src to dst as possible.
func copyToMulti(dst [][]byte, src []byte) {
for _, d := range dst {
@@ -76,7 +73,7 @@ func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovec
}
}
- if iovsRequired > maxIovs {
+ if iovsRequired > iovec.MaxIovs {
// The kernel will reject our call if we pass this many iovs.
// Use a single intermediate buffer instead.
b := make([]byte, stopLen)
diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go
index affdbcacb..9d58ea448 100644
--- a/pkg/sentry/fs/host/socket_test.go
+++ b/pkg/sentry/fs/host/socket_test.go
@@ -67,11 +67,12 @@ func TestSocketIsBlocking(t *testing.T) {
if fl&syscall.O_NONBLOCK == syscall.O_NONBLOCK {
t.Fatalf("Expected socket %v to be blocking", pair[1])
}
- sock, err := newSocket(contexttest.Context(t), pair[0], false)
+ ctx := contexttest.Context(t)
+ sock, err := newSocket(ctx, pair[0], false)
if err != nil {
t.Fatalf("newSocket(%v) failed => %v", pair[0], err)
}
- defer sock.DecRef()
+ defer sock.DecRef(ctx)
// Test that the socket now is non-blocking.
if fl, err = getFl(pair[0]); err != nil {
t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[0], err)
@@ -93,11 +94,12 @@ func TestSocketWritev(t *testing.T) {
if err != nil {
t.Fatalf("host socket creation failed: %v", err)
}
- socket, err := newSocket(contexttest.Context(t), pair[0], false)
+ ctx := contexttest.Context(t)
+ socket, err := newSocket(ctx, pair[0], false)
if err != nil {
t.Fatalf("newSocket(%v) => %v", pair[0], err)
}
- defer socket.DecRef()
+ defer socket.DecRef(ctx)
buf := []byte("hello world\n")
n, err := socket.Writev(contexttest.Context(t), usermem.BytesIOSequence(buf))
if err != nil {
@@ -115,11 +117,12 @@ func TestSocketWritevLen0(t *testing.T) {
if err != nil {
t.Fatalf("host socket creation failed: %v", err)
}
- socket, err := newSocket(contexttest.Context(t), pair[0], false)
+ ctx := contexttest.Context(t)
+ socket, err := newSocket(ctx, pair[0], false)
if err != nil {
t.Fatalf("newSocket(%v) => %v", pair[0], err)
}
- defer socket.DecRef()
+ defer socket.DecRef(ctx)
n, err := socket.Writev(contexttest.Context(t), usermem.BytesIOSequence(nil))
if err != nil {
t.Fatalf("socket writev failed: %v", err)
@@ -136,11 +139,12 @@ func TestSocketSendMsgLen0(t *testing.T) {
if err != nil {
t.Fatalf("host socket creation failed: %v", err)
}
- sfile, err := newSocket(contexttest.Context(t), pair[0], false)
+ ctx := contexttest.Context(t)
+ sfile, err := newSocket(ctx, pair[0], false)
if err != nil {
t.Fatalf("newSocket(%v) => %v", pair[0], err)
}
- defer sfile.DecRef()
+ defer sfile.DecRef(ctx)
s := sfile.FileOperations.(socket.Socket)
n, terr := s.SendMsg(nil, usermem.BytesIOSequence(nil), []byte{}, 0, false, ktime.Time{}, socket.ControlMessages{})
@@ -158,18 +162,19 @@ func TestListen(t *testing.T) {
if err != nil {
t.Fatalf("syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) => %v", err)
}
- sfile1, err := newSocket(contexttest.Context(t), pair[0], false)
+ ctx := contexttest.Context(t)
+ sfile1, err := newSocket(ctx, pair[0], false)
if err != nil {
t.Fatalf("newSocket(%v) => %v", pair[0], err)
}
- defer sfile1.DecRef()
+ defer sfile1.DecRef(ctx)
socket1 := sfile1.FileOperations.(socket.Socket)
- sfile2, err := newSocket(contexttest.Context(t), pair[1], false)
+ sfile2, err := newSocket(ctx, pair[1], false)
if err != nil {
t.Fatalf("newSocket(%v) => %v", pair[1], err)
}
- defer sfile2.DecRef()
+ defer sfile2.DecRef(ctx)
socket2 := sfile2.FileOperations.(socket.Socket)
// Socketpairs cannot be listened to.
@@ -185,11 +190,11 @@ func TestListen(t *testing.T) {
if err != nil {
t.Fatalf("syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) => %v", err)
}
- sfile3, err := newSocket(contexttest.Context(t), sock, false)
+ sfile3, err := newSocket(ctx, sock, false)
if err != nil {
t.Fatalf("newSocket(%v) => %v", sock, err)
}
- defer sfile3.DecRef()
+ defer sfile3.DecRef(ctx)
socket3 := sfile3.FileOperations.(socket.Socket)
// This socket is not bound so we can't listen on it.
@@ -237,9 +242,10 @@ func TestRelease(t *testing.T) {
}
c := &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)}
want := &ConnectedEndpoint{queue: c.queue}
- want.ref.DecRef()
+ ctx := contexttest.Context(t)
+ want.ref.DecRef(ctx)
fdnotifier.AddFD(int32(c.file.FD()), nil)
- c.Release()
+ c.Release(ctx)
if !reflect.DeepEqual(c, want) {
t.Errorf("got = %#v, want = %#v", c, want)
}
diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go
index 5d4f312cf..c8231e0aa 100644
--- a/pkg/sentry/fs/host/socket_unsafe.go
+++ b/pkg/sentry/fs/host/socket_unsafe.go
@@ -65,10 +65,10 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) (
controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC
if n > length {
- return length, n, msg.Controllen, controlTrunc, err
+ return length, n, msg.Controllen, controlTrunc, nil
}
- return n, n, msg.Controllen, controlTrunc, err
+ return n, n, msg.Controllen, controlTrunc, nil
}
// fdWriteVec sends from bufs to fd.
diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go
index cb91355ab..1183727ab 100644
--- a/pkg/sentry/fs/host/tty.go
+++ b/pkg/sentry/fs/host/tty.go
@@ -17,6 +17,7 @@ package host
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -53,7 +54,7 @@ type TTYFileOperations struct {
func newTTYFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations) *fs.File {
return fs.NewFile(ctx, dirent, flags, &TTYFileOperations{
fileOperations: fileOperations{iops: iops},
- termios: linux.DefaultSlaveTermios,
+ termios: linux.DefaultReplicaTermios,
})
}
@@ -113,16 +114,21 @@ func (t *TTYFileOperations) Write(ctx context.Context, file *fs.File, src userme
}
// Release implements fs.FileOperations.Release.
-func (t *TTYFileOperations) Release() {
+func (t *TTYFileOperations) Release(ctx context.Context) {
t.mu.Lock()
t.fgProcessGroup = nil
t.mu.Unlock()
- t.fileOperations.Release()
+ t.fileOperations.Release(ctx)
}
// Ioctl implements fs.FileOperations.Ioctl.
func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ task := kernel.TaskFromContext(ctx)
+ if task == nil {
+ return 0, syserror.ENOTTY
+ }
+
// Ignore arg[0]. This is the real FD:
fd := t.fileOperations.iops.fileState.FD()
ioctl := args[1].Uint64()
@@ -132,9 +138,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
if err != nil {
return 0, err
}
- _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err = termios.CopyOut(task, args[2].Pointer())
return 0, err
case linux.TCSETS, linux.TCSETSW, linux.TCSETSF:
@@ -146,9 +150,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
}
var termios linux.Termios
- if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ if _, err := termios.CopyIn(task, args[2].Pointer()); err != nil {
return 0, err
}
err := ioctlSetTermios(fd, ioctl, &termios)
@@ -173,10 +175,8 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
// Map the ProcessGroup into a ProcessGroupID in the task's PID
// namespace.
- pgID := pidns.IDOfProcessGroup(t.fgProcessGroup)
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ pgID := primitive.Int32(pidns.IDOfProcessGroup(t.fgProcessGroup))
+ _, err := pgID.CopyOut(task, args[2].Pointer())
return 0, err
case linux.TIOCSPGRP:
@@ -184,11 +184,6 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
// Equivalent to tcsetpgrp(fd, *argp).
// Set the foreground process group ID of this terminal.
- task := kernel.TaskFromContext(ctx)
- if task == nil {
- return 0, syserror.ENOTTY
- }
-
t.mu.Lock()
defer t.mu.Unlock()
@@ -208,12 +203,11 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
return 0, syserror.ENOTTY
}
- var pgID kernel.ProcessGroupID
- if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ var pgIDP primitive.Int32
+ if _, err := pgIDP.CopyIn(task, args[2].Pointer()); err != nil {
return 0, err
}
+ pgID := kernel.ProcessGroupID(pgIDP)
// pgID must be non-negative.
if pgID < 0 {
@@ -242,9 +236,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
if err != nil {
return 0, err
}
- _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err = winsize.CopyOut(task, args[2].Pointer())
return 0, err
case linux.TIOCSWINSZ:
@@ -255,9 +247,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
// background ones) can set the winsize.
var winsize linux.Winsize
- if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ if _, err := winsize.CopyIn(task, args[2].Pointer()); err != nil {
return 0, err
}
err := ioctlSetWinsize(fd, &winsize)
@@ -308,9 +298,9 @@ func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) e
task := kernel.TaskFromContext(ctx)
if task == nil {
// No task? Linux does not have an analog for this case, but
- // tty_check_change is more of a blacklist of cases than a
- // whitelist, and is surprisingly permissive. Allowing the
- // change seems most appropriate.
+ // tty_check_change only blocks specific cases and is
+ // surprisingly permissive. Allowing the change seems
+ // appropriate.
return nil
}
@@ -358,7 +348,7 @@ func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) e
//
// Linux ignores the result of kill_pgrp().
_ = pg.SendSignal(kernel.SignalInfoPriv(sig))
- return kernel.ERESTARTSYS
+ return syserror.ERESTARTSYS
}
// LINT.ThenChange(../../fsimpl/host/tty.go)
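
The tty.go hunks above replace reflection-based usermem.CopyObjectIn/Out with go_marshal-style CopyIn/CopyOut calls against the Task. A self-contained sketch of the idea, using stand-in types (CopyContext, Int32, and fakeTask here are illustrative, not gvisor's actual definitions):

package main

import (
	"encoding/binary"
	"fmt"
)

// CopyContext stands in for gvisor's marshal.CopyContext (e.g. a Task).
type CopyContext interface {
	CopyOutBytes(addr uintptr, b []byte) (int, error)
}

// Int32 mirrors the idea of primitive.Int32: a scalar that knows its own
// wire layout, so callers no longer need reflection-based CopyObjectOut.
type Int32 int32

// CopyOut serializes the value and hands the raw bytes to the context.
func (i Int32) CopyOut(cc CopyContext, addr uintptr) (int, error) {
	var b [4]byte
	binary.LittleEndian.PutUint32(b[:], uint32(i))
	return cc.CopyOutBytes(addr, b[:])
}

// fakeTask records writes instead of touching real user memory.
type fakeTask struct{ mem map[uintptr][]byte }

func (t *fakeTask) CopyOutBytes(addr uintptr, b []byte) (int, error) {
	t.mem[addr] = append([]byte(nil), b...)
	return len(b), nil
}

func main() {
	task := &fakeTask{mem: map[uintptr][]byte{}}
	pgID := Int32(42)
	n, err := pgID.CopyOut(task, 0x1000)
	fmt.Println(n, err, task.mem[0x1000]) // 4 <nil> [42 0 0 0]
}

The design win is that each marshallable type owns its serialization, so the kernel object only has to expose raw byte copies.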
diff --git a/pkg/sentry/fs/host/wait_test.go b/pkg/sentry/fs/host/wait_test.go
index ce397a5e3..c143f4ce2 100644
--- a/pkg/sentry/fs/host/wait_test.go
+++ b/pkg/sentry/fs/host/wait_test.go
@@ -39,7 +39,7 @@ func TestWait(t *testing.T) {
t.Fatalf("NewFile failed: %v", err)
}
- defer file.DecRef()
+ defer file.DecRef(ctx)
r := file.Readiness(waiter.EventIn)
if r != 0 {
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index a34fbc946..004910453 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -96,13 +96,12 @@ func NewInode(ctx context.Context, iops InodeOperations, msrc *MountSource, satt
}
// DecRef drops a reference on the Inode.
-func (i *Inode) DecRef() {
- i.DecRefWithDestructor(i.destroy)
+func (i *Inode) DecRef(ctx context.Context) {
+ i.DecRefWithDestructor(ctx, i.destroy)
}
// destroy releases the Inode and releases the msrc reference taken.
-func (i *Inode) destroy() {
- ctx := context.Background()
+func (i *Inode) destroy(ctx context.Context) {
if err := i.WriteOut(ctx); err != nil {
// FIXME(b/65209558): Mark as warning again once noatime is
// properly supported.
@@ -122,12 +121,12 @@ func (i *Inode) destroy() {
i.Watches.targetDestroyed()
if i.overlay != nil {
- i.overlay.release()
+ i.overlay.release(ctx)
} else {
i.InodeOperations.Release(ctx)
}
- i.MountSource.DecRef()
+ i.MountSource.DecRef(ctx)
}
// Mappable calls i.InodeOperations.Mappable.
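
The shape of this refactor recurs throughout the section: DecRef and the destructor now accept the caller's context instead of fabricating context.Background() inside destroy. A minimal sketch of a context-aware refcount (illustrative only, not gvisor's refs implementation; it borrows the convention that a count of 0 means one outstanding reference):

package main

import (
	"context"
	"fmt"
	"sync/atomic"
)

// refCounter sketches DecRefWithDestructor(ctx, ...): the caller's
// context threads through to the destructor.
type refCounter struct {
	refs int64 // starts at 0, meaning one outstanding reference
}

func (r *refCounter) IncRef() { atomic.AddInt64(&r.refs, 1) }

func (r *refCounter) DecRefWithDestructor(ctx context.Context, destroy func(context.Context)) {
	if atomic.AddInt64(&r.refs, -1) == -1 {
		destroy(ctx)
	}
}

func main() {
	var r refCounter
	r.IncRef()
	r.DecRefWithDestructor(context.Background(), func(context.Context) {}) // no-op: a ref remains
	r.DecRefWithDestructor(context.Background(), func(ctx context.Context) {
		fmt.Println("destroyed with caller's ctx:", ctx != nil) // true
	})
}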
@@ -271,7 +270,7 @@ func (i *Inode) GetXattr(ctx context.Context, name string, size uint64) (string,
// SetXattr calls i.InodeOperations.SetXattr with i as the Inode.
func (i *Inode) SetXattr(ctx context.Context, d *Dirent, name, value string, flags uint32) error {
if i.overlay != nil {
- return overlaySetxattr(ctx, i.overlay, d, name, value, flags)
+ return overlaySetXattr(ctx, i.overlay, d, name, value, flags)
}
return i.InodeOperations.SetXattr(ctx, i, name, value, flags)
}
diff --git a/pkg/sentry/fs/inode_inotify.go b/pkg/sentry/fs/inode_inotify.go
index efd3c962b..9911a00c2 100644
--- a/pkg/sentry/fs/inode_inotify.go
+++ b/pkg/sentry/fs/inode_inotify.go
@@ -17,6 +17,7 @@ package fs
import (
"fmt"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sync"
)
@@ -136,11 +137,11 @@ func (w *Watches) Notify(name string, events, cookie uint32) {
}
// Unpin unpins dirent from all watches in this set.
-func (w *Watches) Unpin(d *Dirent) {
+func (w *Watches) Unpin(ctx context.Context, d *Dirent) {
w.mu.RLock()
defer w.mu.RUnlock()
for _, watch := range w.ws {
- watch.Unpin(d)
+ watch.Unpin(ctx, d)
}
}
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index 537c8d257..b16ab08ba 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -16,7 +16,6 @@ package fs
import (
"fmt"
- "strings"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
@@ -85,7 +84,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name
upperInode = child.Inode
upperInode.IncRef()
}
- child.DecRef()
+ child.DecRef(ctx)
}
// Are we done?
@@ -108,7 +107,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name
entry, err := newOverlayEntry(ctx, upperInode, nil, false)
if err != nil {
// Don't leak resources.
- upperInode.DecRef()
+ upperInode.DecRef(ctx)
parent.copyMu.RUnlock()
return nil, false, err
}
@@ -129,7 +128,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name
if err != nil && err != syserror.ENOENT {
// Don't leak resources.
if upperInode != nil {
- upperInode.DecRef()
+ upperInode.DecRef(ctx)
}
parent.copyMu.RUnlock()
return nil, false, err
@@ -152,7 +151,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name
}
}
}
- child.DecRef()
+ child.DecRef(ctx)
}
}
@@ -183,7 +182,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name
// unnecessary because we don't need to copy-up and we will always
// operate (e.g. read/write) on the upper Inode.
if !IsDir(upperInode.StableAttr) {
- lowerInode.DecRef()
+ lowerInode.DecRef(ctx)
lowerInode = nil
}
}
@@ -194,10 +193,10 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name
// Well, not quite, we failed at the last moment, how depressing.
// Be sure not to leak resources.
if upperInode != nil {
- upperInode.DecRef()
+ upperInode.DecRef(ctx)
}
if lowerInode != nil {
- lowerInode.DecRef()
+ lowerInode.DecRef(ctx)
}
parent.copyMu.RUnlock()
return nil, false, err
@@ -248,7 +247,7 @@ func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name st
// user) will clobber the real path for the underlying Inode.
upperFile.Dirent.Inode.IncRef()
upperDirent := NewTransientDirent(upperFile.Dirent.Inode)
- upperFile.Dirent.DecRef()
+ upperFile.Dirent.DecRef(ctx)
upperFile.Dirent = upperDirent
// Create the overlay inode and dirent. We need this to construct the
@@ -259,7 +258,7 @@ func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name st
// The overlay file created below with NewFile will take a reference on
// the overlayDirent, and it should be the only thing holding a
// reference at the time of creation, so we must drop this reference.
- defer overlayDirent.DecRef()
+ defer overlayDirent.DecRef(ctx)
// Create a new overlay file that wraps the upper file.
flags.Pread = upperFile.Flags().Pread
@@ -399,7 +398,7 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena
if !replaced.IsNegative() && IsDir(replaced.Inode.StableAttr) {
children, err := readdirOne(ctx, replaced)
if err != nil {
- replaced.DecRef()
+ replaced.DecRef(ctx)
return err
}
@@ -407,12 +406,12 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena
// included among the returned children, so we don't
// need to bother checking for them.
if len(children) > 0 {
- replaced.DecRef()
+ replaced.DecRef(ctx)
return syserror.ENOTEMPTY
}
}
- replaced.DecRef()
+ replaced.DecRef(ctx)
}
}
@@ -455,12 +454,12 @@ func overlayBind(ctx context.Context, o *overlayEntry, parent *Dirent, name stri
// Grab the inode and drop the dirent, we don't need it.
inode := d.Inode
inode.IncRef()
- d.DecRef()
+ d.DecRef(ctx)
// Create a new overlay entry and dirent for the socket.
entry, err := newOverlayEntry(ctx, inode, nil, false)
if err != nil {
- inode.DecRef()
+ inode.DecRef(ctx)
return nil, err
}
// Use the parent's MountSource, since that corresponds to the overlay,
@@ -539,7 +538,7 @@ func overlayGetXattr(ctx context.Context, o *overlayEntry, name string, size uin
// Don't forward the value of the extended attribute if it would
// unexpectedly change the behavior of a wrapping overlay layer.
- if strings.HasPrefix(XattrOverlayPrefix, name) {
+ if isXattrOverlay(name) {
return "", syserror.ENODATA
}
@@ -553,9 +552,9 @@ func overlayGetXattr(ctx context.Context, o *overlayEntry, name string, size uin
return s, err
}
-func overlaySetxattr(ctx context.Context, o *overlayEntry, d *Dirent, name, value string, flags uint32) error {
+func overlaySetXattr(ctx context.Context, o *overlayEntry, d *Dirent, name, value string, flags uint32) error {
// Don't allow changes to overlay xattrs through a setxattr syscall.
- if strings.HasPrefix(XattrOverlayPrefix, name) {
+ if isXattrOverlay(name) {
return syserror.EPERM
}
@@ -578,7 +577,7 @@ func overlayListXattr(ctx context.Context, o *overlayEntry, size uint64) (map[st
for name := range names {
// Same as overlayGetXattr, we shouldn't forward along
// overlay attributes.
- if strings.HasPrefix(XattrOverlayPrefix, name) {
+ if isXattrOverlay(name) {
delete(names, name)
}
}
@@ -587,7 +586,7 @@ func overlayListXattr(ctx context.Context, o *overlayEntry, size uint64) (map[st
func overlayRemoveXattr(ctx context.Context, o *overlayEntry, d *Dirent, name string) error {
// Don't allow changes to overlay xattrs through a removexattr syscall.
- if strings.HasPrefix(XattrOverlayPrefix, name) {
+ if isXattrOverlay(name) {
return syserror.EPERM
}
@@ -672,7 +671,7 @@ func overlayGetlink(ctx context.Context, o *overlayEntry) (*Dirent, error) {
// ground and claim that jumping around the filesystem like this
// is not supported.
name, _ := dirent.FullName(nil)
- dirent.DecRef()
+ dirent.DecRef(ctx)
// Claim that the path is not accessible.
err = syserror.EACCES
diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go
index 389c219d6..aa9851b26 100644
--- a/pkg/sentry/fs/inode_overlay_test.go
+++ b/pkg/sentry/fs/inode_overlay_test.go
@@ -316,7 +316,7 @@ func TestCacheFlush(t *testing.T) {
t.Fatalf("NewMountNamespace failed: %v", err)
}
root := mns.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
ctx = &rootContext{
Context: ctx,
@@ -345,7 +345,7 @@ func TestCacheFlush(t *testing.T) {
}
// Drop the file reference.
- file.DecRef()
+ file.DecRef(ctx)
// Dirent should have 2 refs left.
if got, want := dirent.ReadRefs(), 2; int(got) != want {
@@ -361,7 +361,7 @@ func TestCacheFlush(t *testing.T) {
}
// Drop our ref.
- dirent.DecRef()
+ dirent.DecRef(ctx)
// We should be back to zero refs.
if got, want := dirent.ReadRefs(), 0; int(got) != want {
@@ -398,7 +398,7 @@ func (d *dir) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags
if err != nil {
return nil, err
}
- defer file.DecRef()
+ defer file.DecRef(ctx)
// Wrap the file's FileOperations in a dirFile.
fops := &dirFile{
FileOperations: file.FileOperations,
diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go
index e3a715c1f..c5c07d564 100644
--- a/pkg/sentry/fs/inotify.go
+++ b/pkg/sentry/fs/inotify.go
@@ -80,7 +80,7 @@ func NewInotify(ctx context.Context) *Inotify {
// Release implements FileOperations.Release. Release removes all watches and
// frees all resources for an inotify instance.
-func (i *Inotify) Release() {
+func (i *Inotify) Release(ctx context.Context) {
// We need to hold i.mu to avoid a race with concurrent calls to
// Inotify.targetDestroyed from Watches. There's no risk of Watches
// accessing this Inotify after the destructor ends, because we remove all
@@ -93,7 +93,7 @@ func (i *Inotify) Release() {
// the owner's destructor.
w.target.Watches.Remove(w.ID())
// Don't leak any references to the target, held by pins in the watch.
- w.destroy()
+ w.destroy(ctx)
}
}
@@ -321,7 +321,7 @@ func (i *Inotify) AddWatch(target *Dirent, mask uint32) int32 {
//
// RmWatch looks up an inotify watch for the given 'wd' and configures the
// target dirent to stop sending events to this inotify instance.
-func (i *Inotify) RmWatch(wd int32) error {
+func (i *Inotify) RmWatch(ctx context.Context, wd int32) error {
i.mu.Lock()
// Find the watch we were asked to remove.
@@ -346,7 +346,7 @@ func (i *Inotify) RmWatch(wd int32) error {
i.queueEvent(newEvent(watch.wd, "", linux.IN_IGNORED, 0))
// Remove all pins.
- watch.destroy()
+ watch.destroy(ctx)
return nil
}
diff --git a/pkg/sentry/fs/inotify_watch.go b/pkg/sentry/fs/inotify_watch.go
index 900cba3ca..605423d22 100644
--- a/pkg/sentry/fs/inotify_watch.go
+++ b/pkg/sentry/fs/inotify_watch.go
@@ -18,6 +18,7 @@ import (
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sync"
)
@@ -105,12 +106,12 @@ func (w *Watch) Pin(d *Dirent) {
// Unpin drops any extra refs held on dirent due to a previous Pin
// call. Calling Unpin multiple times for the same dirent, or on a dirent
// without a corresponding Pin call is a no-op.
-func (w *Watch) Unpin(d *Dirent) {
+func (w *Watch) Unpin(ctx context.Context, d *Dirent) {
w.mu.Lock()
defer w.mu.Unlock()
if w.pins[d] {
delete(w.pins, d)
- d.DecRef()
+ d.DecRef(ctx)
}
}
@@ -125,11 +126,11 @@ func (w *Watch) TargetDestroyed() {
// this watch. Destroy does not cause any new events to be generated. The caller
// is responsible for ensuring there are no outstanding references to this
// watch.
-func (w *Watch) destroy() {
+func (w *Watch) destroy(ctx context.Context) {
w.mu.Lock()
defer w.mu.Unlock()
for d := range w.pins {
- d.DecRef()
+ d.DecRef(ctx)
}
w.pins = nil
}
diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go
index 926538d90..8a5d9c7eb 100644
--- a/pkg/sentry/fs/lock/lock.go
+++ b/pkg/sentry/fs/lock/lock.go
@@ -62,7 +62,7 @@ import (
type LockType int
// UniqueID is a unique identifier of the holder of a regional file lock.
-type UniqueID uint64
+type UniqueID interface{}
const (
// ReadLock describes a POSIX regional file lock to be taken
@@ -98,12 +98,7 @@ type Lock struct {
// If len(Readers) > 0 then HasWriter must be false.
Readers map[UniqueID]bool
- // HasWriter indicates that this is a write lock held by a single
- // UniqueID.
- HasWriter bool
-
- // Writer is only valid if HasWriter is true. It identifies a
- // single write lock holder.
+ // Writer holds the writer's unique ID. It is nil if there is no writer.
Writer UniqueID
}
@@ -186,7 +181,6 @@ func makeLock(uid UniqueID, t LockType) Lock {
case ReadLock:
value.Readers[uid] = true
case WriteLock:
- value.HasWriter = true
value.Writer = uid
default:
panic(fmt.Sprintf("makeLock: invalid lock type %d", t))
@@ -196,10 +190,7 @@ func makeLock(uid UniqueID, t LockType) Lock {
// isHeld returns true if uid is a holder of Lock.
func (l Lock) isHeld(uid UniqueID) bool {
- if l.HasWriter && l.Writer == uid {
- return true
- }
- return l.Readers[uid]
+ return l.Writer == uid || l.Readers[uid]
}
// lock sets uid as a holder of a typed lock on Lock.
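
Switching UniqueID from uint64 to interface{} is what lets a nil Writer mean "no writer" and collapses the old HasWriter/Writer pair into one field. A runnable sketch of the comparison semantics (note that interface equality requires holder IDs with comparable dynamic types):

package main

import "fmt"

// UniqueID as interface{}: nil is the "no writer" sentinel.
type UniqueID interface{}

type Lock struct {
	Readers map[UniqueID]bool
	Writer  UniqueID
}

// isHeld works without a HasWriter flag because a nil interface never
// compares equal to a non-nil holder ID.
func (l Lock) isHeld(uid UniqueID) bool {
	return l.Writer == uid || l.Readers[uid]
}

func main() {
	l := Lock{Readers: map[UniqueID]bool{}}
	fmt.Println(l.isHeld(UniqueID(1))) // false: Writer is nil
	l.Writer = UniqueID(1)
	fmt.Println(l.isHeld(UniqueID(1))) // true: same dynamic type and value
	fmt.Println(l.isHeld(UniqueID(2))) // false
}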
@@ -214,20 +205,20 @@ func (l *Lock) lock(uid UniqueID, t LockType) {
}
// We cannot downgrade a write lock to a read lock unless the
// uid is the same.
- if l.HasWriter {
+ if l.Writer != nil {
if l.Writer != uid {
panic(fmt.Sprintf("lock: cannot downgrade write lock to read lock for uid %d, writer is %d", uid, l.Writer))
}
// Ensure that there is only one reader if upgrading.
l.Readers = make(map[UniqueID]bool)
// Ensure that there is no longer a writer.
- l.HasWriter = false
+ l.Writer = nil
}
l.Readers[uid] = true
return
case WriteLock:
// If we are already the writer, then this is a no-op.
- if l.HasWriter && l.Writer == uid {
+ if l.Writer == uid {
return
}
// We can only upgrade a read lock to a write lock if there
@@ -243,7 +234,6 @@ func (l *Lock) lock(uid UniqueID, t LockType) {
}
// Ensure that there is only a writer.
l.Readers = make(map[UniqueID]bool)
- l.HasWriter = true
l.Writer = uid
default:
panic(fmt.Sprintf("lock: invalid lock type %d", t))
@@ -277,9 +267,8 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool {
switch t {
case ReadLock:
return l.lockable(r, func(value Lock) bool {
- // If there is no writer, there's no problem adding
- // another reader.
- if !value.HasWriter {
+ // If there is no writer, there's no problem adding another reader.
+ if value.Writer == nil {
return true
}
// If there is a writer, then it must be the same uid
@@ -289,10 +278,9 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool {
case WriteLock:
return l.lockable(r, func(value Lock) bool {
// If there are only readers.
- if !value.HasWriter {
- // Then this uid can only take a write lock if
- // this is a private upgrade, meaning that the
- // only reader is uid.
+ if value.Writer == nil {
+ // Then this uid can only take a write lock if this is a private
+ // upgrade, meaning that the only reader is uid.
return len(value.Readers) == 1 && value.Readers[uid]
}
// If the uid is already a writer on this region, then
@@ -304,7 +292,8 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool {
}
}
-// lock returns true if uid took a lock of type t on the entire range of LockRange.
+// lock returns true if uid took a lock of type t on the entire range of
+// LockRange.
//
// Preconditions: r.Start <= r.End (will panic otherwise).
func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool {
@@ -339,7 +328,7 @@ func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool {
seg, _ = l.SplitUnchecked(seg, r.End)
}
- // Set the lock on the segment. This is guaranteed to
+ // Set the lock on the segment. This is guaranteed to
// always be safe, given canLock above.
value := seg.ValuePtr()
value.lock(uid, t)
@@ -386,7 +375,7 @@ func (l *LockSet) unlock(uid UniqueID, r LockRange) {
value := seg.Value()
var remove bool
- if value.HasWriter && value.Writer == uid {
+ if value.Writer == uid {
// If we are unlocking a writer, then since there can
// only ever be one writer and no readers, then this
// lock should always be removed from the set.
diff --git a/pkg/sentry/fs/lock/lock_set_functions.go b/pkg/sentry/fs/lock/lock_set_functions.go
index 8a3ace0c1..50a16e662 100644
--- a/pkg/sentry/fs/lock/lock_set_functions.go
+++ b/pkg/sentry/fs/lock/lock_set_functions.go
@@ -44,14 +44,9 @@ func (lockSetFunctions) Merge(r1 LockRange, val1 Lock, r2 LockRange, val2 Lock)
return Lock{}, false
}
}
- if val1.HasWriter != val2.HasWriter {
+ if val1.Writer != val2.Writer {
return Lock{}, false
}
- if val1.HasWriter {
- if val1.Writer != val2.Writer {
- return Lock{}, false
- }
- }
return val1, true
}
@@ -62,7 +57,6 @@ func (lockSetFunctions) Split(r LockRange, val Lock, split uint64) (Lock, Lock)
for k, v := range val.Readers {
val0.Readers[k] = v
}
- val0.HasWriter = val.HasWriter
val0.Writer = val.Writer
return val, val0
diff --git a/pkg/sentry/fs/lock/lock_test.go b/pkg/sentry/fs/lock/lock_test.go
index ba002aeb7..fad90984b 100644
--- a/pkg/sentry/fs/lock/lock_test.go
+++ b/pkg/sentry/fs/lock/lock_test.go
@@ -42,9 +42,6 @@ func equals(e0, e1 []entry) bool {
if !reflect.DeepEqual(e0[i].LockRange, e1[i].LockRange) {
return false
}
- if e0[i].Lock.HasWriter != e1[i].Lock.HasWriter {
- return false
- }
if e0[i].Lock.Writer != e1[i].Lock.Writer {
return false
}
@@ -105,7 +102,7 @@ func TestCanLock(t *testing.T) {
LockRange: LockRange{2048, 3072},
},
{
- Lock: Lock{HasWriter: true, Writer: 1},
+ Lock: Lock{Writer: 1},
LockRange: LockRange{3072, 4096},
},
})
@@ -241,7 +238,7 @@ func TestSetLock(t *testing.T) {
// 0 max uint64
after: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, LockEOF},
},
},
@@ -254,7 +251,7 @@ func TestSetLock(t *testing.T) {
// 0 max uint64
before: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, LockEOF},
},
},
@@ -273,7 +270,7 @@ func TestSetLock(t *testing.T) {
LockRange: LockRange{0, 4096},
},
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{4096, LockEOF},
},
},
@@ -301,7 +298,7 @@ func TestSetLock(t *testing.T) {
// 0 4096 max uint64
after: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, 4096},
},
{
@@ -318,7 +315,7 @@ func TestSetLock(t *testing.T) {
// 0 max uint64
before: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, LockEOF},
},
},
@@ -550,7 +547,7 @@ func TestSetLock(t *testing.T) {
LockRange: LockRange{0, 1024},
},
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{1024, 4096},
},
{
@@ -594,7 +591,7 @@ func TestSetLock(t *testing.T) {
LockRange: LockRange{0, 1024},
},
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{1024, 3072},
},
{
@@ -633,7 +630,7 @@ func TestSetLock(t *testing.T) {
// 0 1024 2048 4096 max uint64
before: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, 1024},
},
{
@@ -663,11 +660,11 @@ func TestSetLock(t *testing.T) {
// 0 1024 max uint64
after: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, 1024},
},
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{1024, LockEOF},
},
},
@@ -675,28 +672,30 @@ func TestSetLock(t *testing.T) {
}
for _, test := range tests {
- l := fill(test.before)
+ t.Run(test.name, func(t *testing.T) {
+ l := fill(test.before)
- r := LockRange{Start: test.start, End: test.end}
- success := l.lock(test.uid, test.lockType, r)
- var got []entry
- for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
- got = append(got, entry{
- Lock: seg.Value(),
- LockRange: seg.Range(),
- })
- }
+ r := LockRange{Start: test.start, End: test.end}
+ success := l.lock(test.uid, test.lockType, r)
+ var got []entry
+ for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+ got = append(got, entry{
+ Lock: seg.Value(),
+ LockRange: seg.Range(),
+ })
+ }
- if success != test.success {
- t.Errorf("%s: setlock(%v, %+v, %d, %d) got success %v, want %v", test.name, test.before, r, test.uid, test.lockType, success, test.success)
- continue
- }
+ if success != test.success {
+ t.Errorf("setlock(%v, %+v, %d, %d) got success %v, want %v", test.before, r, test.uid, test.lockType, success, test.success)
+ return
+ }
- if success {
- if !equals(got, test.after) {
- t.Errorf("%s: got set %+v, want %+v", test.name, got, test.after)
+ if success {
+ if !equals(got, test.after) {
+ t.Errorf("got set %+v, want %+v", got, test.after)
+ }
}
- }
+ })
}
}
@@ -782,7 +781,7 @@ func TestUnlock(t *testing.T) {
// 0 max uint64
before: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, LockEOF},
},
},
@@ -824,7 +823,7 @@ func TestUnlock(t *testing.T) {
// 0 max uint64
before: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, LockEOF},
},
},
@@ -837,7 +836,7 @@ func TestUnlock(t *testing.T) {
// 0 4096 max uint64
after: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{4096, LockEOF},
},
},
@@ -876,7 +875,7 @@ func TestUnlock(t *testing.T) {
// 0 max uint64
before: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, LockEOF},
},
},
@@ -889,7 +888,7 @@ func TestUnlock(t *testing.T) {
// 0 4096
after: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, 4096},
},
},
@@ -906,7 +905,7 @@ func TestUnlock(t *testing.T) {
LockRange: LockRange{0, 1024},
},
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{1024, 4096},
},
{
@@ -974,7 +973,7 @@ func TestUnlock(t *testing.T) {
// 0 1024 4096 max uint64
before: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, 1024},
},
{
@@ -991,7 +990,7 @@ func TestUnlock(t *testing.T) {
// 0 8 4096 max uint64
after: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, 8},
},
{
@@ -1008,7 +1007,7 @@ func TestUnlock(t *testing.T) {
// 0 1024 4096 max uint64
before: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, 1024},
},
{
@@ -1025,7 +1024,7 @@ func TestUnlock(t *testing.T) {
// 0 1024 4096 8192 max uint64
after: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, 1024},
},
{
@@ -1041,19 +1040,21 @@ func TestUnlock(t *testing.T) {
}
for _, test := range tests {
- l := fill(test.before)
+ t.Run(test.name, func(t *testing.T) {
+ l := fill(test.before)
- r := LockRange{Start: test.start, End: test.end}
- l.unlock(test.uid, r)
- var got []entry
- for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
- got = append(got, entry{
- Lock: seg.Value(),
- LockRange: seg.Range(),
- })
- }
- if !equals(got, test.after) {
- t.Errorf("%s: got set %+v, want %+v", test.name, got, test.after)
- }
+ r := LockRange{Start: test.start, End: test.end}
+ l.unlock(test.uid, r)
+ var got []entry
+ for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+ got = append(got, entry{
+ Lock: seg.Value(),
+ LockRange: seg.Range(),
+ })
+ }
+ if !equals(got, test.after) {
+ t.Errorf("got set %+v, want %+v", got, test.after)
+ }
+ })
}
}
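
The test refactor above is the standard table-driven subtest move: wrapping each case in t.Run names it individually and replaces continue with an early return. A generic sketch of the pattern:

package example

import "testing"

// Table-driven subtests: t.Run names each case and lets a failing case
// return early instead of using continue against the outer loop.
func TestDouble(t *testing.T) {
	tests := []struct {
		name string
		in   int
		want int
	}{
		{name: "zero", in: 0, want: 0},
		{name: "two", in: 2, want: 4},
	}
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			if got := test.in * 2; got != test.want {
				t.Errorf("double(%d) = %d, want %d", test.in, got, test.want)
			}
		})
	}
}

A side benefit is selectability: go test -run 'TestDouble/two' now targets a single case by name.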
diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go
index 37bae6810..ee69b10e8 100644
--- a/pkg/sentry/fs/mount.go
+++ b/pkg/sentry/fs/mount.go
@@ -51,7 +51,7 @@ type MountSourceOperations interface {
DirentOperations
// Destroy destroys the MountSource.
- Destroy()
+ Destroy(ctx context.Context)
// Below are MountSourceOperations that do not conform to Linux.
@@ -165,16 +165,16 @@ func (msrc *MountSource) DecDirentRefs() {
}
}
-func (msrc *MountSource) destroy() {
+func (msrc *MountSource) destroy(ctx context.Context) {
if c := msrc.DirentRefs(); c != 0 {
panic(fmt.Sprintf("MountSource with non-zero direntRefs is being destroyed: %d", c))
}
- msrc.MountSourceOperations.Destroy()
+ msrc.MountSourceOperations.Destroy(ctx)
}
// DecRef drops a reference on the MountSource.
-func (msrc *MountSource) DecRef() {
- msrc.DecRefWithDestructor(msrc.destroy)
+func (msrc *MountSource) DecRef(ctx context.Context) {
+ msrc.DecRefWithDestructor(ctx, msrc.destroy)
}
// FlushDirentRefs drops all references held by the MountSource on Dirents.
@@ -264,7 +264,7 @@ func (*SimpleMountSourceOperations) ResetInodeMappings() {}
func (*SimpleMountSourceOperations) SaveInodeMapping(*Inode, string) {}
// Destroy implements MountSourceOperations.Destroy.
-func (*SimpleMountSourceOperations) Destroy() {}
+func (*SimpleMountSourceOperations) Destroy(context.Context) {}
// Info defines attributes of a filesystem.
type Info struct {
diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go
index 78e35b1e6..7badc75d6 100644
--- a/pkg/sentry/fs/mount_overlay.go
+++ b/pkg/sentry/fs/mount_overlay.go
@@ -115,9 +115,9 @@ func (o *overlayMountSourceOperations) SaveInodeMapping(inode *Inode, path strin
}
// Destroy drops references on the upper and lower MountSource.
-func (o *overlayMountSourceOperations) Destroy() {
- o.upper.DecRef()
- o.lower.DecRef()
+func (o *overlayMountSourceOperations) Destroy(ctx context.Context) {
+ o.upper.DecRef(ctx)
+ o.lower.DecRef(ctx)
}
// type overlayFilesystem is the filesystem for overlay mounts.
diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go
index a3d10770b..6c296f5d0 100644
--- a/pkg/sentry/fs/mount_test.go
+++ b/pkg/sentry/fs/mount_test.go
@@ -18,6 +18,7 @@ import (
"fmt"
"testing"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/contexttest"
)
@@ -32,13 +33,13 @@ func cacheReallyContains(cache *DirentCache, d *Dirent) bool {
return false
}
-func mountPathsAre(root *Dirent, got []*Mount, want ...string) error {
+func mountPathsAre(ctx context.Context, root *Dirent, got []*Mount, want ...string) error {
gotPaths := make(map[string]struct{}, len(got))
gotStr := make([]string, len(got))
for i, g := range got {
if groot := g.Root(); groot != nil {
name, _ := groot.FullName(root)
- groot.DecRef()
+ groot.DecRef(ctx)
gotStr[i] = name
gotPaths[name] = struct{}{}
}
@@ -69,7 +70,7 @@ func TestMountSourceOnlyCachedOnce(t *testing.T) {
t.Fatalf("NewMountNamespace failed: %v", err)
}
rootDirent := mm.Root()
- defer rootDirent.DecRef()
+ defer rootDirent.DecRef(ctx)
// Get a child of the root which we will mount over. Note that the
// MockInodeOperations causes Walk to always succeed.
@@ -125,7 +126,7 @@ func TestAllMountsUnder(t *testing.T) {
t.Fatalf("NewMountNamespace failed: %v", err)
}
rootDirent := mm.Root()
- defer rootDirent.DecRef()
+ defer rootDirent.DecRef(ctx)
// Add mounts at the following paths:
paths := []string{
@@ -150,14 +151,14 @@ func TestAllMountsUnder(t *testing.T) {
if err := mm.Mount(ctx, d, submountInode); err != nil {
t.Fatalf("could not mount at %q: %v", p, err)
}
- d.DecRef()
+ d.DecRef(ctx)
}
// mm root should contain all submounts (and does not include the root mount).
rootMnt := mm.FindMount(rootDirent)
submounts := mm.AllMountsUnder(rootMnt)
allPaths := append(paths, "/")
- if err := mountPathsAre(rootDirent, submounts, allPaths...); err != nil {
+ if err := mountPathsAre(ctx, rootDirent, submounts, allPaths...); err != nil {
t.Error(err)
}
@@ -181,9 +182,9 @@ func TestAllMountsUnder(t *testing.T) {
if err != nil {
t.Fatalf("could not find path %q in mount manager: %v", "/foo", err)
}
- defer d.DecRef()
+ defer d.DecRef(ctx)
submounts = mm.AllMountsUnder(mm.FindMount(d))
- if err := mountPathsAre(rootDirent, submounts, "/foo", "/foo/bar", "/foo/qux", "/foo/bar/baz"); err != nil {
+ if err := mountPathsAre(ctx, rootDirent, submounts, "/foo", "/foo/bar", "/foo/qux", "/foo/bar/baz"); err != nil {
t.Error(err)
}
@@ -193,9 +194,9 @@ func TestAllMountsUnder(t *testing.T) {
if err != nil {
t.Fatalf("could not find path %q in mount manager: %v", "/waldo", err)
}
- defer waldo.DecRef()
+ defer waldo.DecRef(ctx)
submounts = mm.AllMountsUnder(mm.FindMount(waldo))
- if err := mountPathsAre(rootDirent, submounts, "/waldo"); err != nil {
+ if err := mountPathsAre(ctx, rootDirent, submounts, "/waldo"); err != nil {
t.Error(err)
}
}
@@ -212,7 +213,7 @@ func TestUnmount(t *testing.T) {
t.Fatalf("NewMountNamespace failed: %v", err)
}
rootDirent := mm.Root()
- defer rootDirent.DecRef()
+ defer rootDirent.DecRef(ctx)
// Add mounts at the following paths:
paths := []string{
@@ -240,7 +241,7 @@ func TestUnmount(t *testing.T) {
if err := mm.Mount(ctx, d, submountInode); err != nil {
t.Fatalf("could not mount at %q: %v", p, err)
}
- d.DecRef()
+ d.DecRef(ctx)
}
allPaths := make([]string, len(paths)+1)
@@ -259,13 +260,13 @@ func TestUnmount(t *testing.T) {
if err := mm.Unmount(ctx, d, false); err != nil {
t.Fatalf("could not unmount at %q: %v", p, err)
}
- d.DecRef()
+ d.DecRef(ctx)
// Remove the path that has been unmounted and the check that the remaining
// mounts are still there.
allPaths = allPaths[:len(allPaths)-1]
submounts := mm.AllMountsUnder(rootMnt)
- if err := mountPathsAre(rootDirent, submounts, allPaths...); err != nil {
+ if err := mountPathsAre(ctx, rootDirent, submounts, allPaths...); err != nil {
t.Error(err)
}
}
diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go
index 3f2bd0e87..d741c4339 100644
--- a/pkg/sentry/fs/mounts.go
+++ b/pkg/sentry/fs/mounts.go
@@ -234,7 +234,7 @@ func (mns *MountNamespace) flushMountSourceRefsLocked() {
// After destroy is called, the MountNamespace may continue to be referenced (for
// example via /proc/mounts), but should free all resources and shouldn't have
// Find* methods called.
-func (mns *MountNamespace) destroy() {
+func (mns *MountNamespace) destroy(ctx context.Context) {
mns.mu.Lock()
defer mns.mu.Unlock()
@@ -247,13 +247,13 @@ func (mns *MountNamespace) destroy() {
for _, mp := range mns.mounts {
// Drop the mount reference on all mounted dirents.
for ; mp != nil; mp = mp.previous {
- mp.root.DecRef()
+ mp.root.DecRef(ctx)
}
}
mns.mounts = nil
// Drop reference on the root.
- mns.root.DecRef()
+ mns.root.DecRef(ctx)
// Ensure that root cannot be accessed via this MountNamespace any
// more.
@@ -265,8 +265,8 @@ func (mns *MountNamespace) destroy() {
}
// DecRef implements RefCounter.DecRef with destructor mns.destroy.
-func (mns *MountNamespace) DecRef() {
- mns.DecRefWithDestructor(mns.destroy)
+func (mns *MountNamespace) DecRef(ctx context.Context) {
+ mns.DecRefWithDestructor(ctx, mns.destroy)
}
// withMountLocked prevents further walks to `node`, because `node` is about to
@@ -312,7 +312,7 @@ func (mns *MountNamespace) Mount(ctx context.Context, mountPoint *Dirent, inode
if err != nil {
return err
}
- defer replacement.DecRef()
+ defer replacement.DecRef(ctx)
// Set the mount's root dirent and id.
parentMnt := mns.findMountLocked(mountPoint)
@@ -394,7 +394,7 @@ func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly
panic(fmt.Sprintf("Last mount in the chain must be a undo mount: %+v", prev))
}
// Drop mount reference taken at the end of MountNamespace.Mount.
- prev.root.DecRef()
+ prev.root.DecRef(ctx)
} else {
mns.mounts[prev.root] = prev
}
@@ -496,11 +496,11 @@ func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path
// non-directory root is hopeless.
if current != root {
if !IsDir(current.Inode.StableAttr) {
- current.DecRef() // Drop reference from above.
+ current.DecRef(ctx) // Drop reference from above.
return nil, syserror.ENOTDIR
}
if err := current.Inode.CheckPermission(ctx, PermMask{Execute: true}); err != nil {
- current.DecRef() // Drop reference from above.
+ current.DecRef(ctx) // Drop reference from above.
return nil, err
}
}
@@ -511,12 +511,12 @@ func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path
// Allow failed walks to cache the dirent, because no
// children will acquire a reference at the end.
current.maybeExtendReference()
- current.DecRef()
+ current.DecRef(ctx)
return nil, err
}
// Drop old reference.
- current.DecRef()
+ current.DecRef(ctx)
if remainder != "" {
// Ensure it's resolved, unless it's the last level.
@@ -570,11 +570,11 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, rema
case nil:
// Make sure we didn't exhaust the traversal budget.
if *remainingTraversals == 0 {
- target.DecRef()
+ target.DecRef(ctx)
return nil, syscall.ELOOP
}
- node.DecRef() // Drop the original reference.
+ node.DecRef(ctx) // Drop the original reference.
return target, nil
case syscall.ENOLINK:
@@ -582,7 +582,7 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, rema
return node, nil
case ErrResolveViaReadlink:
- defer node.DecRef() // See above.
+ defer node.DecRef(ctx) // See above.
// First, check if we should traverse.
if *remainingTraversals == 0 {
@@ -608,7 +608,7 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, rema
return d, err
default:
- node.DecRef() // Drop for err; see above.
+ node.DecRef(ctx) // Drop for err; see above.
// Propagate the error.
return nil, err
diff --git a/pkg/sentry/fs/mounts_test.go b/pkg/sentry/fs/mounts_test.go
index a69b41468..975d6cbc9 100644
--- a/pkg/sentry/fs/mounts_test.go
+++ b/pkg/sentry/fs/mounts_test.go
@@ -51,7 +51,7 @@ func TestFindLink(t *testing.T) {
}
root := mm.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
foo, err := root.Walk(ctx, root, "foo")
if err != nil {
t.Fatalf("Error walking to foo: %v", err)
diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go
index a8ae7d81d..01a1235b8 100644
--- a/pkg/sentry/fs/overlay.go
+++ b/pkg/sentry/fs/overlay.go
@@ -86,13 +86,12 @@ func isXattrOverlay(name string) bool {
// NewOverlayRoot produces the root of an overlay.
//
// Preconditions:
-//
-// - upper and lower must be non-nil.
-// - upper must not be an overlay.
-// - lower should not expose character devices, pipes, or sockets, because
+// * upper and lower must be non-nil.
+// * upper must not be an overlay.
+// * lower should not expose character devices, pipes, or sockets, because
// copying up these types of files is not supported.
-// - lower must not require that file objects be revalidated.
-// - lower must not have dynamic file/directory content.
+// * lower must not require that file objects be revalidated.
+// * lower must not have dynamic file/directory content.
func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags MountSourceFlags) (*Inode, error) {
if !IsDir(upper.StableAttr) {
return nil, fmt.Errorf("upper Inode is a %v, not a directory", upper.StableAttr.Type)
@@ -107,7 +106,7 @@ func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags Mount
msrc := newOverlayMountSource(ctx, upper.MountSource, lower.MountSource, flags)
overlay, err := newOverlayEntry(ctx, upper, lower, true)
if err != nil {
- msrc.DecRef()
+ msrc.DecRef(ctx)
return nil, err
}
@@ -117,12 +116,11 @@ func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags Mount
// NewOverlayRootFile produces the root of an overlay that points to a file.
//
// Preconditions:
-//
-// - lower must be non-nil.
-// - lower should not expose character devices, pipes, or sockets, because
+// * lower must be non-nil.
+// * lower should not expose character devices, pipes, or sockets, because
copying up these types of files is not supported. Neither can it be a directory.
-// - lower must not require that file objects be revalidated.
-// - lower must not have dynamic file/directory content.
+// * lower must not require that file objects be revalidated.
+// * lower must not have dynamic file/directory content.
func NewOverlayRootFile(ctx context.Context, upperMS *MountSource, lower *Inode, flags MountSourceFlags) (*Inode, error) {
if !IsRegular(lower.StableAttr) {
return nil, fmt.Errorf("lower Inode is not a regular file")
@@ -130,7 +128,7 @@ func NewOverlayRootFile(ctx context.Context, upperMS *MountSource, lower *Inode,
msrc := newOverlayMountSource(ctx, upperMS, lower.MountSource, flags)
overlay, err := newOverlayEntry(ctx, nil, lower, true)
if err != nil {
- msrc.DecRef()
+ msrc.DecRef(ctx)
return nil, err
}
return newOverlayInode(ctx, overlay, msrc), nil
@@ -230,16 +228,16 @@ func newOverlayEntry(ctx context.Context, upper *Inode, lower *Inode, lowerExist
}, nil
}
-func (o *overlayEntry) release() {
+func (o *overlayEntry) release(ctx context.Context) {
// We drop a reference on upper and lower file system Inodes
// rather than releasing them, because in-memory filesystems
// may hold an extra reference to these Inodes so that they
// stay in memory.
if o.upper != nil {
- o.upper.DecRef()
+ o.upper.DecRef(ctx)
}
if o.lower != nil {
- o.lower.DecRef()
+ o.lower.DecRef(ctx)
}
}
diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD
index 77c2c5c0e..b8b2281a8 100644
--- a/pkg/sentry/fs/proc/BUILD
+++ b/pkg/sentry/fs/proc/BUILD
@@ -50,6 +50,7 @@ go_library(
"//pkg/sync",
"//pkg/syserror",
"//pkg/tcpip/header",
+ "//pkg/tcpip/network/ipv4",
"//pkg/usermem",
"//pkg/waiter",
],
diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go
index 35972e23c..45523adf8 100644
--- a/pkg/sentry/fs/proc/fds.go
+++ b/pkg/sentry/fs/proc/fds.go
@@ -56,11 +56,11 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF
// readDescriptors reads fds in the task starting at offset, and calls the
// toDentAttr callback for each to get a DentAttr, which it then emits. This is
// a helper for implementing fs.InodeOperations.Readdir.
-func readDescriptors(t *kernel.Task, c *fs.DirCtx, offset int64, toDentAttr func(int) fs.DentAttr) (int64, error) {
+func readDescriptors(ctx context.Context, t *kernel.Task, c *fs.DirCtx, offset int64, toDentAttr func(int) fs.DentAttr) (int64, error) {
var fds []int32
t.WithMuLocked(func(t *kernel.Task) {
if fdTable := t.FDTable(); fdTable != nil {
- fds = fdTable.GetFDs()
+ fds = fdTable.GetFDs(ctx)
}
})
@@ -116,7 +116,7 @@ func (f *fd) GetFile(context.Context, *fs.Dirent, fs.FileFlags) (*fs.File, error
func (f *fd) Readlink(ctx context.Context, _ *fs.Inode) (string, error) {
root := fs.RootFromContext(ctx)
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(ctx)
}
n, _ := f.file.Dirent.FullName(root)
return n, nil
@@ -135,13 +135,7 @@ func (f *fd) Truncate(context.Context, *fs.Inode, int64) error {
func (f *fd) Release(ctx context.Context) {
f.Symlink.Release(ctx)
- f.file.DecRef()
-}
-
-// Close releases the reference on the file.
-func (f *fd) Close() error {
- f.file.DecRef()
- return nil
+ f.file.DecRef(ctx)
}
// fdDir is an InodeOperations for /proc/TID/fd.
@@ -227,7 +221,7 @@ func (f *fdDirFile) Readdir(ctx context.Context, file *fs.File, ser fs.DentrySer
if f.isInfoFile {
typ = fs.Symlink
}
- return readDescriptors(f.t, dirCtx, file.Offset(), func(fd int) fs.DentAttr {
+ return readDescriptors(ctx, f.t, dirCtx, file.Offset(), func(fd int) fs.DentAttr {
return fs.GenericDentAttr(typ, device.ProcDevice)
})
}
@@ -261,7 +255,7 @@ func (fdid *fdInfoDir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs
// locks, and other data. For now we only have flags.
// See https://www.kernel.org/doc/Documentation/filesystems/proc.txt
flags := file.Flags().ToLinux() | fdFlags.ToLinuxFileFlags()
- file.DecRef()
+ file.DecRef(ctx)
contents := []byte(fmt.Sprintf("flags:\t0%o\n", flags))
return newStaticProcInode(ctx, dir.MountSource, contents)
})
diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go
index 1fc9c703c..6a63c47b3 100644
--- a/pkg/sentry/fs/proc/mounts.go
+++ b/pkg/sentry/fs/proc/mounts.go
@@ -47,7 +47,7 @@ func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) {
// The task has been destroyed. Nothing to show here.
return
}
- defer rootDir.DecRef()
+ defer rootDir.DecRef(t)
mnt := t.MountNamespace().FindMount(rootDir)
if mnt == nil {
@@ -64,7 +64,7 @@ func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) {
continue // No longer valid.
}
mountPath, desc := mroot.FullName(rootDir)
- mroot.DecRef()
+ mroot.DecRef(t)
if !desc {
// MountSources that are not descendants of the chroot jail are ignored.
continue
@@ -97,7 +97,7 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se
if mroot == nil {
return // No longer valid.
}
- defer mroot.DecRef()
+ defer mroot.DecRef(ctx)
// Format:
// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
@@ -216,7 +216,7 @@ func (mf *mountsFile) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHan
if root == nil {
return // No longer valid.
}
- defer root.DecRef()
+ defer root.DecRef(ctx)
flags := root.Inode.MountSource.Flags
opts := "rw"
diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go
index bd18177d4..83a43aa26 100644
--- a/pkg/sentry/fs/proc/net.go
+++ b/pkg/sentry/fs/proc/net.go
@@ -419,7 +419,7 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s
}
sfile := s.(*fs.File)
if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX {
- s.DecRef()
+ s.DecRef(ctx)
// Not a unix socket.
continue
}
@@ -479,7 +479,7 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s
}
fmt.Fprintf(&buf, "\n")
- s.DecRef()
+ s.DecRef(ctx)
}
data := []seqfile.SeqData{
@@ -574,7 +574,7 @@ func commonReadSeqFileDataTCP(ctx context.Context, n seqfile.SeqHandle, k *kerne
panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
}
if family, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) {
- s.DecRef()
+ s.DecRef(ctx)
// Not tcp4 sockets.
continue
}
@@ -664,7 +664,7 @@ func commonReadSeqFileDataTCP(ctx context.Context, n seqfile.SeqHandle, k *kerne
fmt.Fprintf(&buf, "\n")
- s.DecRef()
+ s.DecRef(ctx)
}
data := []seqfile.SeqData{
@@ -752,7 +752,7 @@ func (n *netUDP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se
panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
}
if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM {
- s.DecRef()
+ s.DecRef(ctx)
// Not udp4 socket.
continue
}
@@ -822,7 +822,7 @@ func (n *netUDP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se
fmt.Fprintf(&buf, "\n")
- s.DecRef()
+ s.DecRef(ctx)
}
data := []seqfile.SeqData{
diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go
index c659224a7..77e0e1d26 100644
--- a/pkg/sentry/fs/proc/proc.go
+++ b/pkg/sentry/fs/proc/proc.go
@@ -213,7 +213,7 @@ func (rpf *rootProcFile) Readdir(ctx context.Context, file *fs.File, ser fs.Dent
// Add dot and dotdot.
root := fs.RootFromContext(ctx)
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(ctx)
}
dot, dotdot := file.Dirent.GetDotAttrs(root)
names = append(names, ".", "..")
diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go
index 702fdd392..e555672ad 100644
--- a/pkg/sentry/fs/proc/sys_net.go
+++ b/pkg/sentry/fs/proc/sys_net.go
@@ -26,6 +26,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -54,7 +55,7 @@ type tcpMemInode struct {
// size stores the tcp buffer size during save, and sets the buffer
// size in netstack in restore. We must save/restore this here, since
- // netstack itself is stateless.
+ // a netstack instance is created on restore.
size inet.TCPBufferSize
// mu protects against concurrent reads/writes to files based on this
@@ -258,6 +259,9 @@ func (f *tcpSackFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSeque
if src.NumBytes() == 0 {
return 0, nil
}
+
+ // Only consider one memory page of input, for performance reasons.
+ // We only care whether the value is zero or nonzero anyway.
src = src.TakeFirst(usermem.PageSize - 1)
var v int32
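
The TakeFirst(usermem.PageSize - 1) guard recurs in every proc-file Write path in this change. A self-contained sketch of the same clamp-then-parse shape (parseSysctlBool is a hypothetical stand-in for usermem.CopyInt32StringInVec):

package main

import (
	"fmt"
	"strconv"
	"strings"
)

const pageSize = 4096

// parseSysctlBool clamps the input to one page, parses an integer, and
// treats any nonzero value as true, mirroring the Write paths above.
func parseSysctlBool(src string) (bool, error) {
	if len(src) > pageSize-1 {
		src = src[:pageSize-1]
	}
	v, err := strconv.ParseInt(strings.TrimSpace(src), 10, 32)
	if err != nil {
		return false, err
	}
	return v != 0, nil
}

func main() {
	on, err := parseSysctlBool("1\n")
	fmt.Println(on, err) // true <nil>
}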
@@ -272,6 +276,96 @@ func (f *tcpSackFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSeque
return n, f.tcpSack.stack.SetTCPSACKEnabled(*f.tcpSack.enabled)
}
+// +stateify savable
+type tcpRecovery struct {
+ fsutil.SimpleFileInode
+
+ stack inet.Stack `state:"wait"`
+ recovery inet.TCPLossRecovery
+}
+
+func newTCPRecoveryInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode {
+ ts := &tcpRecovery{
+ SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC),
+ stack: s,
+ }
+ sattr := fs.StableAttr{
+ DeviceID: device.ProcDevice.DeviceID(),
+ InodeID: device.ProcDevice.NextIno(),
+ BlockSize: usermem.PageSize,
+ Type: fs.SpecialFile,
+ }
+ return fs.NewInode(ctx, ts, msrc, sattr)
+}
+
+// Truncate implements fs.InodeOperations.Truncate.
+func (*tcpRecovery) Truncate(context.Context, *fs.Inode, int64) error {
+ return nil
+}
+
+// GetFile implements fs.InodeOperations.GetFile.
+func (r *tcpRecovery) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+ flags.Pread = true
+ flags.Pwrite = true
+ return fs.NewFile(ctx, dirent, flags, &tcpRecoveryFile{
+ tcpRecovery: r,
+ stack: r.stack,
+ }), nil
+}
+
+// +stateify savable
+type tcpRecoveryFile struct {
+ fsutil.FileGenericSeek `state:"nosave"`
+ fsutil.FileNoIoctl `state:"nosave"`
+ fsutil.FileNoMMap `state:"nosave"`
+ fsutil.FileNoSplice `state:"nosave"`
+ fsutil.FileNoopRelease `state:"nosave"`
+ fsutil.FileNoopFlush `state:"nosave"`
+ fsutil.FileNoopFsync `state:"nosave"`
+ fsutil.FileNotDirReaddir `state:"nosave"`
+ fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+ waiter.AlwaysReady `state:"nosave"`
+
+ tcpRecovery *tcpRecovery
+
+ stack inet.Stack `state:"wait"`
+}
+
+// Read implements fs.FileOperations.Read.
+func (f *tcpRecoveryFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+ if offset != 0 {
+ return 0, io.EOF
+ }
+
+ recovery, err := f.stack.TCPRecovery()
+ if err != nil {
+ return 0, err
+ }
+ f.tcpRecovery.recovery = recovery
+ s := fmt.Sprintf("%d\n", f.tcpRecovery.recovery)
+ n, err := dst.CopyOut(ctx, []byte(s))
+ return int64(n), err
+}
+
+// Write implements fs.FileOperations.Write.
+func (f *tcpRecoveryFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+ if src.NumBytes() == 0 {
+ return 0, nil
+ }
+ src = src.TakeFirst(usermem.PageSize - 1)
+
+ var v int32
+ n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+ if err != nil {
+ return 0, err
+ }
+ f.tcpRecovery.recovery = inet.TCPLossRecovery(v)
+ if err := f.tcpRecovery.stack.SetTCPRecovery(f.tcpRecovery.recovery); err != nil {
+ return 0, err
+ }
+ return n, nil
+}
+
func (p *proc) newSysNetCore(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode {
// The following files are simple stubs until they are implemented in
// netstack; most of these files are configuration related. We use the
@@ -293,11 +387,125 @@ func (p *proc) newSysNetCore(ctx context.Context, msrc *fs.MountSource, s inet.S
return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil)
}
+// ipForwarding implements fs.InodeOperations.
+//
+// ipForwarding is used to enable/disable packet forwarding of netstack.
+//
+// +stateify savable
+type ipForwarding struct {
+ fsutil.SimpleFileInode
+
+ stack inet.Stack `state:"wait"`
+
+ // enabled stores the IPv4 forwarding state on save.
+ // We must save/restore this here, since a netstack instance
+ // is created on restore.
+ enabled *bool
+}
+
+func newIPForwardingInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode {
+ ipf := &ipForwarding{
+ SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC),
+ stack: s,
+ }
+ sattr := fs.StableAttr{
+ DeviceID: device.ProcDevice.DeviceID(),
+ InodeID: device.ProcDevice.NextIno(),
+ BlockSize: usermem.PageSize,
+ Type: fs.SpecialFile,
+ }
+ return fs.NewInode(ctx, ipf, msrc, sattr)
+}
+
+// Truncate implements fs.InodeOperations.Truncate. Truncate is called when
+// O_TRUNC is specified for any kind of existing Dirent but is not called via
+// (f)truncate for proc files.
+func (*ipForwarding) Truncate(context.Context, *fs.Inode, int64) error {
+ return nil
+}
+
+// +stateify savable
+type ipForwardingFile struct {
+ fsutil.FileGenericSeek `state:"nosave"`
+ fsutil.FileNoIoctl `state:"nosave"`
+ fsutil.FileNoMMap `state:"nosave"`
+ fsutil.FileNoSplice `state:"nosave"`
+ fsutil.FileNoopFlush `state:"nosave"`
+ fsutil.FileNoopFsync `state:"nosave"`
+ fsutil.FileNoopRelease `state:"nosave"`
+ fsutil.FileNotDirReaddir `state:"nosave"`
+ fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+ waiter.AlwaysReady `state:"nosave"`
+
+ ipf *ipForwarding
+
+ stack inet.Stack `state:"wait"`
+}
+
+// GetFile implements fs.InodeOperations.GetFile.
+func (ipf *ipForwarding) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+ flags.Pread = true
+ flags.Pwrite = true
+ return fs.NewFile(ctx, dirent, flags, &ipForwardingFile{
+ stack: ipf.stack,
+ ipf: ipf,
+ }), nil
+}
+
+// Read implements fs.FileOperations.Read.
+func (f *ipForwardingFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+ if offset != 0 {
+ return 0, io.EOF
+ }
+
+ if f.ipf.enabled == nil {
+ enabled := f.stack.Forwarding(ipv4.ProtocolNumber)
+ f.ipf.enabled = &enabled
+ }
+
+ val := "0\n"
+ if *f.ipf.enabled {
+ // Technically, this is not quite compatible with Linux. Linux
+ // stores these as an integer, so if you write "2" into
+ // ip_forward, you should get 2 back.
+ val = "1\n"
+ }
+ n, err := dst.CopyOut(ctx, []byte(val))
+ return int64(n), err
+}
+
+// Write implements fs.FileOperations.Write.
+//
+// Offset is ignored; multiple writes are not supported.
+func (f *ipForwardingFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+ if src.NumBytes() == 0 {
+ return 0, nil
+ }
+
+ // Only consider one memory page of input, for performance reasons.
+ // We only care whether the value is zero or nonzero anyway.
+ src = src.TakeFirst(usermem.PageSize - 1)
+
+ var v int32
+ n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+ if err != nil {
+ return n, err
+ }
+ if f.ipf.enabled == nil {
+ f.ipf.enabled = new(bool)
+ }
+ *f.ipf.enabled = v != 0
+ return n, f.stack.SetForwarding(ipv4.ProtocolNumber, *f.ipf.enabled)
+}
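
One observable consequence of the Read comment above: writes are parsed as an int32 and collapsed to a boolean, so any nonzero value reads back as "1\n". A sketch, reusing the test-stack setup from TestIPForwarding below:

	ctx := context.Background()
	s := inet.NewTestStack()
	ipf := &ipForwarding{stack: s}
	f := &ipForwardingFile{stack: s, ipf: ipf}

	// Writing "2404" enables forwarding...
	if _, err := f.Write(ctx, nil, usermem.BytesIOSequence([]byte("2404")), 0); err != nil {
		panic(err)
	}
	// ...but reads back normalized, unlike Linux.
	buf := make([]byte, 2)
	f.Read(ctx, nil, usermem.BytesIOSequence(buf), 0)
	// buf now holds "1\n", not "2".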
+
func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode {
contents := map[string]*fs.Inode{
// Add tcp_sack.
"tcp_sack": newTCPSackInode(ctx, msrc, s),
+ // Add ip_forward.
+ "ip_forward": newIPForwardingInode(ctx, msrc, s),
+
// The following files are simple stubs until they are
// implemented in netstack; most of these files are
// configuration-related. We use the value closest to the
@@ -351,6 +559,11 @@ func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s ine
contents["tcp_wmem"] = newTCPMemInode(ctx, msrc, s, tcpWMem)
}
+ // Add tcp_recovery.
+ if _, err := s.TCPRecovery(); err == nil {
+ contents["tcp_recovery"] = newTCPRecoveryInode(ctx, msrc, s)
+ }
+
d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil)
}
diff --git a/pkg/sentry/fs/proc/sys_net_state.go b/pkg/sentry/fs/proc/sys_net_state.go
index 6eba709c6..4cb4741af 100644
--- a/pkg/sentry/fs/proc/sys_net_state.go
+++ b/pkg/sentry/fs/proc/sys_net_state.go
@@ -14,7 +14,11 @@
package proc
-import "fmt"
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+)
// beforeSave is invoked by stateify.
func (t *tcpMemInode) beforeSave() {
@@ -40,3 +44,12 @@ func (s *tcpSack) afterLoad() {
}
}
}
+
+// afterLoad is invoked by stateify.
+func (ipf *ipForwarding) afterLoad() {
+ if ipf.enabled != nil {
+ if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, *ipf.enabled); err != nil {
+ panic(fmt.Sprintf("failed to set IPv4 forwarding [%v]: %v", *ipf.enabled, err))
+ }
+ }
+}
diff --git a/pkg/sentry/fs/proc/sys_net_test.go b/pkg/sentry/fs/proc/sys_net_test.go
index 355e83d47..6ef5738e7 100644
--- a/pkg/sentry/fs/proc/sys_net_test.go
+++ b/pkg/sentry/fs/proc/sys_net_test.go
@@ -123,3 +123,76 @@ func TestConfigureRecvBufferSize(t *testing.T) {
}
}
}
+
+// TestIPForwarding tests the implementation of
+// /proc/sys/net/ipv4/ip_forward.
+func TestIPForwarding(t *testing.T) {
+ ctx := context.Background()
+ s := inet.NewTestStack()
+
+ cases := []struct {
+ comment string
+ initial bool
+ str string
+ final bool
+ }{
+ {
+ comment: `Forwarding is disabled; write 1 and enable forwarding`,
+ initial: false,
+ str: "1",
+ final: true,
+ },
+ {
+ comment: `Forwarding is disabled; write 0 and disable forwarding`,
+ initial: false,
+ str: "0",
+ final: false,
+ },
+ {
+ comment: `Forwarding is enabled; write 1 and enable forwarding`,
+ initial: true,
+ str: "1",
+ final: true,
+ },
+ {
+ comment: `Forwarding is enabled; write 0 and disable forwarding`,
+ initial: true,
+ str: "0",
+ final: false,
+ },
+ {
+ comment: `Forwarding is disabled; write 2404 and enable forwarding`,
+ initial: false,
+ str: "2404",
+ final: true,
+ },
+ {
+ comment: `Forwarding is enabled; write 2404 and enable forwarding`,
+ initial: true,
+ str: "2404",
+ final: true,
+ },
+ }
+ for _, c := range cases {
+ t.Run(c.comment, func(t *testing.T) {
+ s.IPForwarding = c.initial
+ ipf := &ipForwarding{stack: s}
+ file := &ipForwardingFile{
+ stack: s,
+ ipf: ipf,
+ }
+
+ // Write the values.
+ src := usermem.BytesIOSequence([]byte(c.str))
+ if n, err := file.Write(ctx, nil, src, 0); n != int64(len(c.str)) || err != nil {
+ t.Errorf("file.Write(ctx, nil, %q, 0) = (%d, %v); want (%d, nil)", c.str, n, err, len(c.str))
+ }
+
+ // Read the values from the stack and check them.
+ if got, want := s.IPForwarding, c.final; got != want {
+ t.Errorf("s.IPForwarding incorrect; got: %v, want: %v", got, want)
+ }
+ })
+ }
+}
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index 4bbe90198..450044c9c 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -84,6 +84,7 @@ func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bo
"auxv": newAuxvec(t, msrc),
"cmdline": newExecArgInode(t, msrc, cmdlineExecArg),
"comm": newComm(t, msrc),
+ "cwd": newCwd(t, msrc),
"environ": newExecArgInode(t, msrc, environExecArg),
"exe": newExe(t, msrc),
"fd": newFdDir(t, msrc),
@@ -91,6 +92,7 @@ func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bo
"gid_map": newGIDMap(t, msrc),
"io": newIO(t, msrc, isThreadGroup),
"maps": newMaps(t, msrc),
+ "mem": newMem(t, msrc),
"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
"mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
"net": newNetDir(t, msrc),
@@ -185,7 +187,7 @@ func (f *subtasksFile) Readdir(ctx context.Context, file *fs.File, ser fs.Dentry
// Serialize "." and "..".
root := fs.RootFromContext(ctx)
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(ctx)
}
dot, dotdot := file.Dirent.GetDotAttrs(root)
if err := dirCtx.DirEmit(".", dot); err != nil {
@@ -295,11 +297,54 @@ func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
if err != nil {
return "", err
}
- defer exec.DecRef()
+ defer exec.DecRef(ctx)
return exec.PathnameWithDeleted(ctx), nil
}
+// cwd is an fs.InodeOperations symlink for the /proc/PID/cwd file.
+//
+// +stateify savable
+type cwd struct {
+ ramfs.Symlink
+
+ t *kernel.Task
+}
+
+func newCwd(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
+ cwdSymlink := &cwd{
+ Symlink: *ramfs.NewSymlink(t, fs.RootOwner, ""),
+ t: t,
+ }
+ return newProcInode(t, cwdSymlink, msrc, fs.Symlink, t)
+}
+
+// Readlink implements fs.InodeOperations.
+func (e *cwd) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
+ if !kernel.ContextCanTrace(ctx, e.t, false) {
+ return "", syserror.EACCES
+ }
+ if err := checkTaskState(e.t); err != nil {
+ return "", err
+ }
+ cwd := e.t.FSContext().WorkingDirectory()
+ if cwd == nil {
+ // It could have raced with process deletion.
+ return "", syserror.ESRCH
+ }
+ defer cwd.DecRef(ctx)
+
+ root := fs.RootFromContext(ctx)
+ if root == nil {
+ // It could have raced with process deletion.
+ return "", syserror.ESRCH
+ }
+ defer root.DecRef(ctx)
+
+ name, _ := cwd.FullName(root)
+ return name, nil
+}
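
From inside the sandbox the new symlink behaves like its Linux counterpart. A minimal user-space check (standard /proc semantics, not specific to this implementation):

package main

import (
	"fmt"
	"log"
	"os"
)

func main() {
	// /proc/self/cwd resolves to the calling process's working directory.
	wd, err := os.Readlink("/proc/self/cwd")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(wd)
}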
+
// namespaceSymlink represents a symlink in the namespacefs, such as the files
// in /proc/<pid>/ns.
//
@@ -355,6 +400,88 @@ func newNamespaceDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
return newProcInode(t, d, msrc, fs.SpecialDirectory, t)
}
+// memData implements fs.InodeOperations for /proc/[pid]/mem.
+//
+// +stateify savable
+type memData struct {
+ fsutil.SimpleFileInode
+
+ t *kernel.Task
+}
+
+// memDataFile implements fs.FileOperations for /proc/[pid]/mem.
+//
+// +stateify savable
+type memDataFile struct {
+ fsutil.FileGenericSeek `state:"nosave"`
+ fsutil.FileNoIoctl `state:"nosave"`
+ fsutil.FileNoMMap `state:"nosave"`
+ fsutil.FileNoWrite `state:"nosave"`
+ fsutil.FileNoSplice `state:"nosave"`
+ fsutil.FileNoopFlush `state:"nosave"`
+ fsutil.FileNoopFsync `state:"nosave"`
+ fsutil.FileNoopRelease `state:"nosave"`
+ fsutil.FileNotDirReaddir `state:"nosave"`
+ fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+ waiter.AlwaysReady `state:"nosave"`
+
+ t *kernel.Task
+}
+
+func newMem(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
+ inode := &memData{
+ SimpleFileInode: *fsutil.NewSimpleFileInode(t, fs.RootOwner, fs.FilePermsFromMode(0400), linux.PROC_SUPER_MAGIC),
+ t: t,
+ }
+ return newProcInode(t, inode, msrc, fs.SpecialFile, t)
+}
+
+// Truncate implements fs.InodeOperations.Truncate.
+func (m *memData) Truncate(context.Context, *fs.Inode, int64) error {
+ return nil
+}
+
+// GetFile implements fs.InodeOperations.GetFile.
+func (m *memData) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+ // TODO(gvisor.dev/issue/260): Add check for PTRACE_MODE_ATTACH_FSCREDS.
+ // Permission to read this file is governed by PTRACE_MODE_ATTACH_FSCREDS.
+ // Since we don't implement setfsuid/setfsgid, we can just use PTRACE_MODE_ATTACH.
+ if !kernel.ContextCanTrace(ctx, m.t, true) {
+ return nil, syserror.EACCES
+ }
+ if err := checkTaskState(m.t); err != nil {
+ return nil, err
+ }
+ // Enable random access reads.
+ flags.Pread = true
+ return fs.NewFile(ctx, dirent, flags, &memDataFile{t: m.t}), nil
+}
+
+// Read implements fs.FileOperations.Read.
+func (m *memDataFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+ if dst.NumBytes() == 0 {
+ return 0, nil
+ }
+ mm, err := getTaskMM(m.t)
+ if err != nil {
+ return 0, nil
+ }
+ defer mm.DecUsers(ctx)
+ // Buffer the read data because of MM locks; copying directly to dst could fault while they are held.
+ buf := make([]byte, dst.NumBytes())
+ n, readErr := mm.CopyIn(ctx, usermem.Addr(offset), buf, usermem.IOOpts{IgnorePermissions: true})
+ if n > 0 {
+ if _, err := dst.CopyOut(ctx, buf[:n]); err != nil {
+ return 0, syserror.EFAULT
+ }
+ return int64(n), nil
+ }
+ if readErr != nil {
+ return 0, syserror.EIO
+ }
+ return 0, nil
+}
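
Since GetFile sets flags.Pread, positioned reads work, which is how /proc/[pid]/mem is normally consumed: pread(2) at a virtual address of the target. A sketch of standard Linux usage, reading our own memory to sidestep the ptrace-attach check (the address would normally come from /proc/[pid]/maps):

package main

import (
	"fmt"
	"log"
	"os"
	"unsafe"

	"golang.org/x/sys/unix"
)

func main() {
	f, err := os.Open("/proc/self/mem")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	data := []byte("hello")
	buf := make([]byte, len(data))
	// pread at the address of data; the file offset is the virtual address.
	addr := int64(uintptr(unsafe.Pointer(&data[0])))
	if _, err := unix.Pread(int(f.Fd()), buf, addr); err != nil {
		log.Fatalf("pread: %v", err)
	}
	fmt.Printf("%s\n", buf) // hello
}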
+
// mapsData implements seqfile.SeqSource for /proc/[pid]/maps.
//
// +stateify savable
@@ -604,7 +731,7 @@ func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) (
var vss, rss, data uint64
s.t.WithMuLocked(func(t *kernel.Task) {
if fdTable := t.FDTable(); fdTable != nil {
- fds = fdTable.Size()
+ fds = fdTable.CurrentMaxFDs()
}
if mm := t.MemoryManager(); mm != nil {
vss = mm.VirtualMemorySize()
diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go
index bfa304552..f4fcddecb 100644
--- a/pkg/sentry/fs/ramfs/dir.go
+++ b/pkg/sentry/fs/ramfs/dir.go
@@ -219,7 +219,7 @@ func (d *Dir) Remove(ctx context.Context, _ *fs.Inode, name string) error {
}
// Remove our reference on the inode.
- inode.DecRef()
+ inode.DecRef(ctx)
return nil
}
@@ -250,7 +250,7 @@ func (d *Dir) RemoveDirectory(ctx context.Context, _ *fs.Inode, name string) err
}
// Remove our reference on the inode.
- inode.DecRef()
+ inode.DecRef(ctx)
return nil
}
@@ -326,7 +326,7 @@ func (d *Dir) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.F
// Create the Dirent and corresponding file.
created := fs.NewDirent(ctx, inode, name)
- defer created.DecRef()
+ defer created.DecRef(ctx)
return created.Inode.GetFile(ctx, created, flags)
}
@@ -412,11 +412,11 @@ func (*Dir) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, ol
}
// Release implements fs.InodeOperation.Release.
-func (d *Dir) Release(_ context.Context) {
+func (d *Dir) Release(ctx context.Context) {
// Drop references on all children.
d.mu.Lock()
for _, i := range d.children {
- i.DecRef()
+ i.DecRef(ctx)
}
d.mu.Unlock()
}
@@ -456,7 +456,7 @@ func (dfo *dirFileOperations) IterateDir(ctx context.Context, d *fs.Dirent, dirC
func (dfo *dirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) {
root := fs.RootFromContext(ctx)
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(ctx)
}
dirCtx := &fs.DirCtx{
Serializer: serializer,
@@ -473,13 +473,13 @@ func hasChildren(ctx context.Context, inode *fs.Inode) (bool, error) {
// dropped when that dirent is destroyed.
inode.IncRef()
d := fs.NewTransientDirent(inode)
- defer d.DecRef()
+ defer d.DecRef(ctx)
file, err := inode.GetFile(ctx, d, fs.FileFlags{Read: true})
if err != nil {
return false, err
}
- defer file.DecRef()
+ defer file.DecRef(ctx)
ser := &fs.CollectEntriesSerializer{}
if err := file.Readdir(ctx, ser); err != nil {
@@ -530,7 +530,7 @@ func Rename(ctx context.Context, oldParent fs.InodeOperations, oldName string, n
if err != nil {
return err
}
- inode.DecRef()
+ inode.DecRef(ctx)
}
// Be careful, we may have already grabbed this mutex above.
diff --git a/pkg/sentry/fs/ramfs/tree_test.go b/pkg/sentry/fs/ramfs/tree_test.go
index a6ed8b2c5..3e0d1e07e 100644
--- a/pkg/sentry/fs/ramfs/tree_test.go
+++ b/pkg/sentry/fs/ramfs/tree_test.go
@@ -67,7 +67,7 @@ func TestMakeDirectoryTree(t *testing.T) {
continue
}
root := mm.Root()
- defer mm.DecRef()
+ defer mm.DecRef(ctx)
for _, p := range test.subdirs {
maxTraversals := uint(0)
diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go
index 88c344089..f362ca9b6 100644
--- a/pkg/sentry/fs/timerfd/timerfd.go
+++ b/pkg/sentry/fs/timerfd/timerfd.go
@@ -55,7 +55,7 @@ type TimerOperations struct {
func NewFile(ctx context.Context, c ktime.Clock) *fs.File {
dirent := fs.NewDirent(ctx, anon.NewInode(ctx), "anon_inode:[timerfd]")
// Release the initial dirent reference after NewFile takes a reference.
- defer dirent.DecRef()
+ defer dirent.DecRef(ctx)
tops := &TimerOperations{}
tops.timer = ktime.NewTimer(c, tops)
// Timerfds reject writes, but the Write flag must be set in order to
@@ -65,7 +65,7 @@ func NewFile(ctx context.Context, c ktime.Clock) *fs.File {
}
// Release implements fs.FileOperations.Release.
-func (t *TimerOperations) Release() {
+func (t *TimerOperations) Release(context.Context) {
t.timer.Destroy()
}
diff --git a/pkg/sentry/fs/tmpfs/file_test.go b/pkg/sentry/fs/tmpfs/file_test.go
index aaba35502..d4d613ea9 100644
--- a/pkg/sentry/fs/tmpfs/file_test.go
+++ b/pkg/sentry/fs/tmpfs/file_test.go
@@ -46,7 +46,7 @@ func newFile(ctx context.Context) *fs.File {
func TestGrow(t *testing.T) {
ctx := contexttest.Context(t)
f := newFile(ctx)
- defer f.DecRef()
+ defer f.DecRef(ctx)
abuf := bytes.Repeat([]byte{'a'}, 68)
n, err := f.Pwritev(ctx, usermem.BytesIOSequence(abuf), 0)
diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go
index 1dc75291d..fc0498f17 100644
--- a/pkg/sentry/fs/tmpfs/inode_file.go
+++ b/pkg/sentry/fs/tmpfs/inode_file.go
@@ -613,7 +613,7 @@ func (f *fileInodeOperations) Translate(ctx context.Context, required, optional
}
mf := f.kernel.MemoryFile()
- cerr := f.data.Fill(ctx, required, optional, mf, f.memUsage, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
+ cerr := f.data.Fill(ctx, required, optional, uint64(f.attr.Size), mf, f.memUsage, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
// Newly-allocated pages are zeroed, so we don't need to do anything.
return dsts.NumBytes(), nil
})
diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go
index b095312fe..998b697ca 100644
--- a/pkg/sentry/fs/tmpfs/tmpfs.go
+++ b/pkg/sentry/fs/tmpfs/tmpfs.go
@@ -16,6 +16,8 @@
package tmpfs
import (
+ "math"
+
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -32,9 +34,15 @@ import (
var fsInfo = fs.Info{
Type: linux.TMPFS_MAGIC,
+ // tmpfs currently does not support configurable size limits. In Linux,
+ // such a tmpfs mount will return f_blocks == f_bfree == f_bavail == 0 from
+ // statfs(2). However, many applications treat this as having a size limit
+ // of 0. To work around this, claim to have a very large but non-zero size,
+ // chosen to ensure that BlockSize * Blocks does not overflow int64 (which
+ // applications may also handle incorrectly).
// TODO(b/29637826): allow configuring a tmpfs size and enforce it.
- TotalBlocks: 0,
- FreeBlocks: 0,
+ TotalBlocks: math.MaxInt64 / usermem.PageSize,
+ FreeBlocks: math.MaxInt64 / usermem.PageSize,
}
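
The chosen constant keeps the statfs arithmetic safely inside int64; spelled out:

package main

import (
	"fmt"
	"math"
)

func main() {
	const pageSize = 4096 // usermem.PageSize on amd64/arm64
	totalBlocks := int64(math.MaxInt64 / pageSize)
	// f_bsize * f_blocks stays below MaxInt64, so applications that
	// multiply them back together cannot overflow.
	fmt.Println(totalBlocks * pageSize)               // 9223372036854771712
	fmt.Println(math.MaxInt64 - totalBlocks*pageSize) // 4095
}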
// rename implements fs.InodeOperations.Rename for tmpfs nodes.
diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD
index 5cb0e0417..e6d0eb359 100644
--- a/pkg/sentry/fs/tty/BUILD
+++ b/pkg/sentry/fs/tty/BUILD
@@ -10,13 +10,14 @@ go_library(
"line_discipline.go",
"master.go",
"queue.go",
- "slave.go",
+ "replica.go",
"terminal.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
"//pkg/context",
+ "//pkg/marshal/primitive",
"//pkg/refs",
"//pkg/safemem",
"//pkg/sentry/arch",
diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go
index 108654827..c2da80bc2 100644
--- a/pkg/sentry/fs/tty/dir.go
+++ b/pkg/sentry/fs/tty/dir.go
@@ -37,14 +37,14 @@ import (
// This indirectly manages all terminals within the mount.
//
// New Terminals are created by masterInodeOperations.GetFile, which registers
-// the slave Inode in the this directory for discovery via Lookup/Readdir. The
-// slave inode is unregistered when the master file is Released, as the slave
+// the replica Inode in this directory for discovery via Lookup/Readdir. The
+// replica inode is unregistered when the master file is Released, as the replica
// is no longer discoverable at that point.
//
// References on the underlying Terminal are held by masterFileOperations and
-// slaveInodeOperations.
+// replicaInodeOperations.
//
-// masterInodeOperations and slaveInodeOperations hold a pointer to
+// masterInodeOperations and replicaInodeOperations hold a pointer to
// dirInodeOperations, which is kept alive by the references their
// corresponding Dirents hold on their parent (this directory).
//
@@ -76,16 +76,16 @@ type dirInodeOperations struct {
// master is the master PTY inode.
master *fs.Inode
- // slaves contains the slave inodes reachable from the directory.
+ // replicas contains the replica inodes reachable from the directory.
//
- // A new slave is added by allocateTerminal and is removed by
+ // A new replica is added by allocateTerminal and is removed by
// masterFileOperations.Release.
//
- // A reference is held on every slave in the map.
- slaves map[uint32]*fs.Inode
+ // A reference is held on every replica in the map.
+ replicas map[uint32]*fs.Inode
// dentryMap is a SortedDentryMap used to implement Readdir containing
- // the master and all entries in slaves.
+ // the master and all entries in replicas.
dentryMap *fs.SortedDentryMap
// next is the next pty index to use.
@@ -101,7 +101,7 @@ func newDir(ctx context.Context, m *fs.MountSource) *fs.Inode {
d := &dirInodeOperations{
InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.RootOwner, fs.FilePermsFromMode(0555), linux.DEVPTS_SUPER_MAGIC),
msrc: m,
- slaves: make(map[uint32]*fs.Inode),
+ replicas: make(map[uint32]*fs.Inode),
dentryMap: fs.NewSortedDentryMap(nil),
}
// Linux devpts uses a default mode of 0000 for ptmx which can be
@@ -132,8 +132,8 @@ func (d *dirInodeOperations) Release(ctx context.Context) {
d.mu.Lock()
defer d.mu.Unlock()
- d.master.DecRef()
- if len(d.slaves) != 0 {
+ d.master.DecRef(ctx)
+ if len(d.replicas) != 0 {
panic(fmt.Sprintf("devpts directory still contains active terminals: %+v", d))
}
}
@@ -149,14 +149,14 @@ func (d *dirInodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name str
return fs.NewDirent(ctx, d.master, name), nil
}
- // Slave number?
+ // Replica number?
n, err := strconv.ParseUint(name, 10, 32)
if err != nil {
// Not found.
return nil, syserror.ENOENT
}
- s, ok := d.slaves[uint32(n)]
+ s, ok := d.replicas[uint32(n)]
if !ok {
return nil, syserror.ENOENT
}
@@ -236,7 +236,7 @@ func (d *dirInodeOperations) allocateTerminal(ctx context.Context) (*Terminal, e
return nil, syserror.ENOMEM
}
- if _, ok := d.slaves[n]; ok {
+ if _, ok := d.replicas[n]; ok {
panic(fmt.Sprintf("pty index collision; index %d already exists", n))
}
@@ -244,41 +244,41 @@ func (d *dirInodeOperations) allocateTerminal(ctx context.Context) (*Terminal, e
d.next++
// The reference returned by newTerminal is returned to the caller.
- // Take another for the slave inode.
+ // Take another for the replica inode.
t.IncRef()
// Create a pts node. The owner is based on the context that opens
// ptmx.
creds := auth.CredentialsFromContext(ctx)
uid, gid := creds.EffectiveKUID, creds.EffectiveKGID
- slave := newSlaveInode(ctx, d, t, fs.FileOwner{uid, gid}, fs.FilePermsFromMode(0666))
+ replica := newReplicaInode(ctx, d, t, fs.FileOwner{uid, gid}, fs.FilePermsFromMode(0666))
- d.slaves[n] = slave
+ d.replicas[n] = replica
d.dentryMap.Add(strconv.FormatUint(uint64(n), 10), fs.DentAttr{
- Type: slave.StableAttr.Type,
- InodeID: slave.StableAttr.InodeID,
+ Type: replica.StableAttr.Type,
+ InodeID: replica.StableAttr.InodeID,
})
return t, nil
}
// masterClose is called when the master end of t is closed.
-func (d *dirInodeOperations) masterClose(t *Terminal) {
+func (d *dirInodeOperations) masterClose(ctx context.Context, t *Terminal) {
d.mu.Lock()
defer d.mu.Unlock()
- // The slave end disappears from the directory when the master end is
- // closed, even if the slave end is open elsewhere.
+ // The replica end disappears from the directory when the master end is
+ // closed, even if the replica end is open elsewhere.
//
// N.B. since we're using a backdoor method to remove a directory entry
// we won't properly fire inotify events like Linux would.
- s, ok := d.slaves[t.n]
+ s, ok := d.replicas[t.n]
if !ok {
panic(fmt.Sprintf("Terminal %+v doesn't exist in %+v?", t, d))
}
- s.DecRef()
- delete(d.slaves, t.n)
+ s.DecRef(ctx)
+ delete(d.replicas, t.n)
d.dentryMap.Remove(strconv.FormatUint(uint64(t.n), 10))
}
@@ -322,7 +322,7 @@ func (df *dirFileOperations) IterateDir(ctx context.Context, d *fs.Dirent, dirCt
func (df *dirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) {
root := fs.RootFromContext(ctx)
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(ctx)
}
dirCtx := &fs.DirCtx{
Serializer: serializer,
diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go
index 8fe05ebe5..13f4901db 100644
--- a/pkg/sentry/fs/tty/fs.go
+++ b/pkg/sentry/fs/tty/fs.go
@@ -79,8 +79,8 @@ type superOperations struct{}
//
// It always returns true, forcing a Lookup for all entries.
//
-// Slave entries are dropped from dir when their master is closed, so an
-// existing slave Dirent in the tree is not sufficient to guarantee that it
+// Replica entries are dropped from dir when their master is closed, so an
+// existing replica Dirent in the tree is not sufficient to guarantee that it
// still exists on the filesystem.
func (superOperations) Revalidate(context.Context, string, *fs.Inode, *fs.Inode) bool {
return true
@@ -108,4 +108,4 @@ func (superOperations) ResetInodeMappings() {}
func (superOperations) SaveInodeMapping(*fs.Inode, string) {}
// Destroy implements MountSourceOperations.Destroy.
-func (superOperations) Destroy() {}
+func (superOperations) Destroy(context.Context) {}
diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go
index 2e9dd2d55..b34f4a0eb 100644
--- a/pkg/sentry/fs/tty/line_discipline.go
+++ b/pkg/sentry/fs/tty/line_discipline.go
@@ -21,6 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -43,7 +44,7 @@ const (
)
// lineDiscipline dictates how input and output are handled between the
-// pseudoterminal (pty) master and slave. It can be configured to alter I/O,
+// pseudoterminal (pty) master and replica. It can be configured to alter I/O,
// modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man
// pages are good resources for how to affect the line discipline:
//
@@ -54,8 +55,8 @@ const (
//
// lineDiscipline has a simple structure but supports a multitude of options
// (see the above man pages). It consists of two queues of bytes: one from the
-// terminal master to slave (the input queue) and one from slave to master (the
-// output queue). When bytes are written to one end of the pty, the line
+// terminal master to replica (the input queue) and one from replica to master
+// (the output queue). When bytes are written to one end of the pty, the line
// discipline reads the bytes, modifies them or takes special action if
// required, and enqueues them to be read by the other end of the pty:
//
@@ -64,7 +65,7 @@ const (
// | (inputQueueWrite) +-------------+ (inputQueueRead) |
// | |
// | v
-// masterFD slaveFD
+// masterFD replicaFD
// ^ |
// | |
// | output to terminal +--------------+ output from process |
@@ -103,8 +104,8 @@ type lineDiscipline struct {
// masterWaiter is used to wait on the master end of the TTY.
masterWaiter waiter.Queue `state:"zerovalue"`
- // slaveWaiter is used to wait on the slave end of the TTY.
- slaveWaiter waiter.Queue `state:"zerovalue"`
+ // replicaWaiter is used to wait on the replica end of the TTY.
+ replicaWaiter waiter.Queue `state:"zerovalue"`
}
func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
@@ -115,27 +116,23 @@ func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
}
// getTermios gets the linux.Termios for the tty.
-func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (l *lineDiscipline) getTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) {
l.termiosMu.RLock()
defer l.termiosMu.RUnlock()
// We must copy a Termios struct, not KernelTermios.
t := l.termios.ToTermios()
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := t.CopyOut(task, args[2].Pointer())
return 0, err
}
// setTermios sets a linux.Termios for the tty.
-func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (l *lineDiscipline) setTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) {
l.termiosMu.Lock()
defer l.termiosMu.Unlock()
oldCanonEnabled := l.termios.LEnabled(linux.ICANON)
// We must copy a Termios struct, not KernelTermios.
var t linux.Termios
- _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := t.CopyIn(task, args[2].Pointer())
l.termios.FromTermios(t)
// If canonical mode is turned off, move bytes from inQueue's wait
@@ -146,27 +143,23 @@ func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arc
l.inQueue.pushWaitBufLocked(l)
l.inQueue.readable = true
l.inQueue.mu.Unlock()
- l.slaveWaiter.Notify(waiter.EventIn)
+ l.replicaWaiter.Notify(waiter.EventIn)
}
return 0, err
}
-func (l *lineDiscipline) windowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (l *lineDiscipline) windowSize(t *kernel.Task, args arch.SyscallArguments) error {
l.sizeMu.Lock()
defer l.sizeMu.Unlock()
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), l.size, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := l.size.CopyOut(t, args[2].Pointer())
return err
}
-func (l *lineDiscipline) setWindowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (l *lineDiscipline) setWindowSize(t *kernel.Task, args arch.SyscallArguments) error {
l.sizeMu.Lock()
defer l.sizeMu.Unlock()
- _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &l.size, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := l.size.CopyIn(t, args[2].Pointer())
return err
}
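
The recurring substitution in this file — usermem.CopyObjectOut/CopyObjectIn replaced by generated CopyOut/CopyIn methods taking a *kernel.Task — all follows one shape. An illustrative helper (copySizeOut is not part of the patch):

// copySizeOut writes an int32 value to the task's address space using
// the marshal-generated primitive type, as the handlers above now do.
func copySizeOut(t *kernel.Task, addr usermem.Addr, n int) error {
	size := primitive.Int32(n)
	_, err := size.CopyOut(t, addr)
	return err
}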
@@ -176,14 +169,14 @@ func (l *lineDiscipline) masterReadiness() waiter.EventMask {
return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios)
}
-func (l *lineDiscipline) slaveReadiness() waiter.EventMask {
+func (l *lineDiscipline) replicaReadiness() waiter.EventMask {
l.termiosMu.RLock()
defer l.termiosMu.RUnlock()
return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios)
}
-func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
- return l.inQueue.readableSize(ctx, io, args)
+func (l *lineDiscipline) inputQueueReadSize(t *kernel.Task, args arch.SyscallArguments) error {
+ return l.inQueue.readableSize(t, args)
}
func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
@@ -196,7 +189,7 @@ func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSeque
if n > 0 {
l.masterWaiter.Notify(waiter.EventOut)
if pushed {
- l.slaveWaiter.Notify(waiter.EventIn)
+ l.replicaWaiter.Notify(waiter.EventIn)
}
return n, nil
}
@@ -211,14 +204,14 @@ func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequ
return 0, err
}
if n > 0 {
- l.slaveWaiter.Notify(waiter.EventIn)
+ l.replicaWaiter.Notify(waiter.EventIn)
return n, nil
}
return 0, syserror.ErrWouldBlock
}
-func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
- return l.outQueue.readableSize(ctx, io, args)
+func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, args arch.SyscallArguments) error {
+ return l.outQueue.readableSize(t, args)
}
func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
@@ -229,7 +222,7 @@ func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequ
return 0, err
}
if n > 0 {
- l.slaveWaiter.Notify(waiter.EventOut)
+ l.replicaWaiter.Notify(waiter.EventOut)
if pushed {
l.masterWaiter.Notify(waiter.EventIn)
}
diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go
index fe07fa929..b91184b1b 100644
--- a/pkg/sentry/fs/tty/master.go
+++ b/pkg/sentry/fs/tty/master.go
@@ -17,9 +17,11 @@ package tty
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/unimpl"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -75,7 +77,7 @@ func newMasterInode(ctx context.Context, d *dirInodeOperations, owner fs.FileOwn
}
// Release implements fs.InodeOperations.Release.
-func (mi *masterInodeOperations) Release(ctx context.Context) {
+func (mi *masterInodeOperations) Release(context.Context) {
}
// Truncate implements fs.InodeOperations.Truncate.
@@ -120,9 +122,9 @@ type masterFileOperations struct {
var _ fs.FileOperations = (*masterFileOperations)(nil)
// Release implements fs.FileOperations.Release.
-func (mf *masterFileOperations) Release() {
- mf.d.masterClose(mf.t)
- mf.t.DecRef()
+func (mf *masterFileOperations) Release(ctx context.Context) {
+ mf.d.masterClose(ctx, mf.t)
+ mf.t.DecRef(ctx)
}
// EventRegister implements waiter.Waitable.EventRegister.
@@ -152,46 +154,51 @@ func (mf *masterFileOperations) Write(ctx context.Context, _ *fs.File, src userm
// Ioctl implements fs.FileOperations.Ioctl.
func (mf *masterFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ // ioctl(2) may only be called from a task goroutine.
+ return 0, syserror.ENOTTY
+ }
+
switch cmd := args[1].Uint(); cmd {
case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
// Get the number of bytes in the output queue read buffer.
- return 0, mf.t.ld.outputQueueReadSize(ctx, io, args)
+ return 0, mf.t.ld.outputQueueReadSize(t, args)
case linux.TCGETS:
// N.B. TCGETS on the master actually returns the configuration
- // of the slave end.
- return mf.t.ld.getTermios(ctx, io, args)
+ // of the replica end.
+ return mf.t.ld.getTermios(t, args)
case linux.TCSETS:
// N.B. TCSETS on the master actually affects the configuration
- // of the slave end.
- return mf.t.ld.setTermios(ctx, io, args)
+ // of the replica end.
+ return mf.t.ld.setTermios(t, args)
case linux.TCSETSW:
// TODO(b/29356795): This should drain the output queue first.
- return mf.t.ld.setTermios(ctx, io, args)
+ return mf.t.ld.setTermios(t, args)
case linux.TIOCGPTN:
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mf.t.n), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ nP := primitive.Uint32(mf.t.n)
+ _, err := nP.CopyOut(t, args[2].Pointer())
return 0, err
case linux.TIOCSPTLCK:
// TODO(b/29356795): Implement pty locking. For now just pretend we do.
return 0, nil
case linux.TIOCGWINSZ:
- return 0, mf.t.ld.windowSize(ctx, io, args)
+ return 0, mf.t.ld.windowSize(t, args)
case linux.TIOCSWINSZ:
- return 0, mf.t.ld.setWindowSize(ctx, io, args)
+ return 0, mf.t.ld.setWindowSize(t, args)
case linux.TIOCSCTTY:
// Make the given terminal the controlling terminal of the
// calling process.
- return 0, mf.t.setControllingTTY(ctx, io, args, true /* isMaster */)
+ return 0, mf.t.setControllingTTY(ctx, args, true /* isMaster */)
case linux.TIOCNOTTY:
// Release this process's controlling terminal.
- return 0, mf.t.releaseControllingTTY(ctx, io, args, true /* isMaster */)
+ return 0, mf.t.releaseControllingTTY(ctx, args, true /* isMaster */)
case linux.TIOCGPGRP:
// Get the foreground process group.
- return mf.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */)
+ return mf.t.foregroundProcessGroup(ctx, args, true /* isMaster */)
case linux.TIOCSPGRP:
// Set the foreground process group.
- return mf.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */)
+ return mf.t.setForegroundProcessGroup(ctx, args, true /* isMaster */)
default:
maybeEmitUnimplementedEvent(ctx, cmd)
return 0, syserror.ENOTTY
diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go
index ceabb9b1e..79975d812 100644
--- a/pkg/sentry/fs/tty/queue.go
+++ b/pkg/sentry/fs/tty/queue.go
@@ -17,8 +17,10 @@ package tty
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -32,7 +34,7 @@ import (
const waitBufMaxBytes = 131072
// queue represents one of the input or output queues between a pty master and
-// slave. Bytes written to a queue are added to the read buffer until it is
+// replica. Bytes written to a queue are added to the read buffer until it is
// full, at which point they are written to the wait buffer. Bytes are
// processed (i.e. undergo termios transformations) as they are added to the
// read buffer. The read buffer is readable when its length is nonzero and
@@ -85,17 +87,15 @@ func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask {
}
// readableSize writes the number of readable bytes to userspace.
-func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (q *queue) readableSize(t *kernel.Task, args arch.SyscallArguments) error {
q.mu.Lock()
defer q.mu.Unlock()
- var size int32
+ size := primitive.Int32(0)
if q.readable {
- size = int32(len(q.readBuf))
+ size = primitive.Int32(len(q.readBuf))
}
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := size.CopyOut(t, args[2].Pointer())
return err
}
@@ -104,8 +104,7 @@ func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.Sysca
// as whether the read caused more readable data to become available (whether
// data was pushed from the wait buffer to the read buffer).
//
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, error) {
q.mu.Lock()
defer q.mu.Unlock()
@@ -145,8 +144,7 @@ func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipl
// write writes to q from userspace.
//
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, error) {
q.mu.Lock()
defer q.mu.Unlock()
@@ -188,8 +186,7 @@ func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscip
// writeBytes writes to q from b.
//
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
func (q *queue) writeBytes(b []byte, l *lineDiscipline) {
q.mu.Lock()
defer q.mu.Unlock()
diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/replica.go
index 9871f6fc6..385d230fb 100644
--- a/pkg/sentry/fs/tty/slave.go
+++ b/pkg/sentry/fs/tty/replica.go
@@ -17,9 +17,11 @@ package tty
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
@@ -27,11 +29,11 @@ import (
// LINT.IfChange
-// slaveInodeOperations are the fs.InodeOperations for the slave end of the
+// replicaInodeOperations are the fs.InodeOperations for the replica end of the
// Terminal (pts file).
//
// +stateify savable
-type slaveInodeOperations struct {
+type replicaInodeOperations struct {
fsutil.SimpleFileInode
// d is the containing dir.
@@ -41,13 +43,13 @@ type slaveInodeOperations struct {
t *Terminal
}
-var _ fs.InodeOperations = (*slaveInodeOperations)(nil)
+var _ fs.InodeOperations = (*replicaInodeOperations)(nil)
-// newSlaveInode creates an fs.Inode for the slave end of a terminal.
+// newReplicaInode creates an fs.Inode for the replica end of a terminal.
//
-// newSlaveInode takes ownership of t.
-func newSlaveInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owner fs.FileOwner, p fs.FilePermissions) *fs.Inode {
- iops := &slaveInodeOperations{
+// newReplicaInode takes ownership of t.
+func newReplicaInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owner fs.FileOwner, p fs.FilePermissions) *fs.Inode {
+ iops := &replicaInodeOperations{
SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, owner, p, linux.DEVPTS_SUPER_MAGIC),
d: d,
t: t,
@@ -64,18 +66,18 @@ func newSlaveInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owne
Type: fs.CharacterDevice,
// See fs/devpts/inode.c:devpts_fill_super.
BlockSize: 1024,
- DeviceFileMajor: linux.UNIX98_PTY_SLAVE_MAJOR,
+ DeviceFileMajor: linux.UNIX98_PTY_REPLICA_MAJOR,
DeviceFileMinor: t.n,
})
}
// Release implements fs.InodeOperations.Release.
-func (si *slaveInodeOperations) Release(ctx context.Context) {
- si.t.DecRef()
+func (si *replicaInodeOperations) Release(ctx context.Context) {
+ si.t.DecRef(ctx)
}
// Truncate implements fs.InodeOperations.Truncate.
-func (*slaveInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
+func (*replicaInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
return nil
}
@@ -83,14 +85,15 @@ func (*slaveInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
//
// This may race with destruction of the terminal. If the terminal is gone, it
// returns ENOENT.
-func (si *slaveInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
- return fs.NewFile(ctx, d, flags, &slaveFileOperations{si: si}), nil
+func (si *replicaInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+ return fs.NewFile(ctx, d, flags, &replicaFileOperations{si: si}), nil
}
-// slaveFileOperations are the fs.FileOperations for the slave end of a terminal.
+// replicaFileOperations are the fs.FileOperations for the replica end of a
+// terminal.
//
// +stateify savable
-type slaveFileOperations struct {
+type replicaFileOperations struct {
fsutil.FilePipeSeek `state:"nosave"`
fsutil.FileNotDirReaddir `state:"nosave"`
fsutil.FileNoFsync `state:"nosave"`
@@ -100,79 +103,84 @@ type slaveFileOperations struct {
fsutil.FileUseInodeUnstableAttr `state:"nosave"`
// si is the inode operations.
- si *slaveInodeOperations
+ si *replicaInodeOperations
}
-var _ fs.FileOperations = (*slaveFileOperations)(nil)
+var _ fs.FileOperations = (*replicaFileOperations)(nil)
// Release implements fs.FileOperations.Release.
-func (sf *slaveFileOperations) Release() {
+func (sf *replicaFileOperations) Release(context.Context) {
}
// EventRegister implements waiter.Waitable.EventRegister.
-func (sf *slaveFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
- sf.si.t.ld.slaveWaiter.EventRegister(e, mask)
+func (sf *replicaFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ sf.si.t.ld.replicaWaiter.EventRegister(e, mask)
}
// EventUnregister implements waiter.Waitable.EventUnregister.
-func (sf *slaveFileOperations) EventUnregister(e *waiter.Entry) {
- sf.si.t.ld.slaveWaiter.EventUnregister(e)
+func (sf *replicaFileOperations) EventUnregister(e *waiter.Entry) {
+ sf.si.t.ld.replicaWaiter.EventUnregister(e)
}
// Readiness implements waiter.Waitable.Readiness.
-func (sf *slaveFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
- return sf.si.t.ld.slaveReadiness()
+func (sf *replicaFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return sf.si.t.ld.replicaReadiness()
}
// Read implements fs.FileOperations.Read.
-func (sf *slaveFileOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
+func (sf *replicaFileOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
return sf.si.t.ld.inputQueueRead(ctx, dst)
}
// Write implements fs.FileOperations.Write.
-func (sf *slaveFileOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
+func (sf *replicaFileOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
return sf.si.t.ld.outputQueueWrite(ctx, src)
}
// Ioctl implements fs.FileOperations.Ioctl.
-func (sf *slaveFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (sf *replicaFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ // ioctl(2) may only be called from a task goroutine.
+ return 0, syserror.ENOTTY
+ }
+
switch cmd := args[1].Uint(); cmd {
case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
// Get the number of bytes in the input queue read buffer.
- return 0, sf.si.t.ld.inputQueueReadSize(ctx, io, args)
+ return 0, sf.si.t.ld.inputQueueReadSize(t, args)
case linux.TCGETS:
- return sf.si.t.ld.getTermios(ctx, io, args)
+ return sf.si.t.ld.getTermios(t, args)
case linux.TCSETS:
- return sf.si.t.ld.setTermios(ctx, io, args)
+ return sf.si.t.ld.setTermios(t, args)
case linux.TCSETSW:
// TODO(b/29356795): This should drain the output queue first.
- return sf.si.t.ld.setTermios(ctx, io, args)
+ return sf.si.t.ld.setTermios(t, args)
case linux.TIOCGPTN:
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sf.si.t.n), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ nP := primitive.Uint32(sf.si.t.n)
+ _, err := nP.CopyOut(t, args[2].Pointer())
return 0, err
case linux.TIOCGWINSZ:
- return 0, sf.si.t.ld.windowSize(ctx, io, args)
+ return 0, sf.si.t.ld.windowSize(t, args)
case linux.TIOCSWINSZ:
- return 0, sf.si.t.ld.setWindowSize(ctx, io, args)
+ return 0, sf.si.t.ld.setWindowSize(t, args)
case linux.TIOCSCTTY:
// Make the given terminal the controlling terminal of the
// calling process.
- return 0, sf.si.t.setControllingTTY(ctx, io, args, false /* isMaster */)
+ return 0, sf.si.t.setControllingTTY(ctx, args, false /* isMaster */)
case linux.TIOCNOTTY:
// Release this process's controlling terminal.
- return 0, sf.si.t.releaseControllingTTY(ctx, io, args, false /* isMaster */)
+ return 0, sf.si.t.releaseControllingTTY(ctx, args, false /* isMaster */)
case linux.TIOCGPGRP:
// Get the foreground process group.
- return sf.si.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */)
+ return sf.si.t.foregroundProcessGroup(ctx, args, false /* isMaster */)
case linux.TIOCSPGRP:
// Set the foreground process group.
- return sf.si.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */)
+ return sf.si.t.setForegroundProcessGroup(ctx, args, false /* isMaster */)
default:
maybeEmitUnimplementedEvent(ctx, cmd)
return 0, syserror.ENOTTY
}
}
-// LINT.ThenChange(../../fsimpl/devpts/slave.go)
+// LINT.ThenChange(../../fsimpl/devpts/replica.go)
diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go
index ddcccf4da..4f431d74d 100644
--- a/pkg/sentry/fs/tty/terminal.go
+++ b/pkg/sentry/fs/tty/terminal.go
@@ -17,10 +17,10 @@ package tty
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/usermem"
)
// LINT.IfChange
@@ -44,19 +44,19 @@ type Terminal struct {
// this terminal. This field is immutable.
masterKTTY *kernel.TTY
- // slaveKTTY contains the controlling process of the slave end of this
+ // replicaKTTY contains the controlling process of the replica end of this
// terminal. This field is immutable.
- slaveKTTY *kernel.TTY
+ replicaKTTY *kernel.TTY
}
func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal {
- termios := linux.DefaultSlaveTermios
+ termios := linux.DefaultReplicaTermios
t := Terminal{
- d: d,
- n: n,
- ld: newLineDiscipline(termios),
- masterKTTY: &kernel.TTY{Index: n},
- slaveKTTY: &kernel.TTY{Index: n},
+ d: d,
+ n: n,
+ ld: newLineDiscipline(termios),
+ masterKTTY: &kernel.TTY{Index: n},
+ replicaKTTY: &kernel.TTY{Index: n},
}
t.EnableLeakCheck("tty.Terminal")
return &t
@@ -64,7 +64,7 @@ func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal
// setControllingTTY makes tm the controlling terminal of the calling thread
// group.
-func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+func (tm *Terminal) setControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error {
task := kernel.TaskFromContext(ctx)
if task == nil {
panic("setControllingTTY must be called from a task context")
@@ -75,7 +75,7 @@ func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args a
// releaseControllingTTY removes tm as the controlling terminal of the calling
// thread group.
-func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+func (tm *Terminal) releaseControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error {
task := kernel.TaskFromContext(ctx)
if task == nil {
panic("releaseControllingTTY must be called from a task context")
@@ -85,7 +85,7 @@ func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, ar
}
// foregroundProcessGroup gets the process group ID of tm's foreground process.
-func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+func (tm *Terminal) foregroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
task := kernel.TaskFromContext(ctx)
if task == nil {
panic("foregroundProcessGroup must be called from a task context")
@@ -97,24 +97,21 @@ func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, a
}
// Write it out to *arg.
- _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ retP := primitive.Int32(ret)
+ _, err = retP.CopyOut(task, args[2].Pointer())
return 0, err
}
// foregroundProcessGroup sets tm's foreground process.
-func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
task := kernel.TaskFromContext(ctx)
if task == nil {
panic("setForegroundProcessGroup must be called from a task context")
}
// Read in the process group ID.
- var pgid int32
- if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ var pgid primitive.Int32
+ if _, err := pgid.CopyIn(task, args[2].Pointer()); err != nil {
return 0, err
}
@@ -126,7 +123,7 @@ func (tm *Terminal) tty(isMaster bool) *kernel.TTY {
if isMaster {
return tm.masterKTTY
}
- return tm.slaveKTTY
+ return tm.replicaKTTY
}
// LINT.ThenChange(../../fsimpl/devpts/terminal.go)
diff --git a/pkg/sentry/fs/tty/tty_test.go b/pkg/sentry/fs/tty/tty_test.go
index 2cbc05678..49edee83d 100644
--- a/pkg/sentry/fs/tty/tty_test.go
+++ b/pkg/sentry/fs/tty/tty_test.go
@@ -22,8 +22,8 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
-func TestSimpleMasterToSlave(t *testing.T) {
- ld := newLineDiscipline(linux.DefaultSlaveTermios)
+func TestSimpleMasterToReplica(t *testing.T) {
+ ld := newLineDiscipline(linux.DefaultReplicaTermios)
ctx := contexttest.Context(t)
inBytes := []byte("hello, tty\n")
src := usermem.BytesIOSequence(inBytes)
diff --git a/pkg/sentry/fs/user/BUILD b/pkg/sentry/fs/user/BUILD
index bd5dac373..66e949c95 100644
--- a/pkg/sentry/fs/user/BUILD
+++ b/pkg/sentry/fs/user/BUILD
@@ -15,6 +15,7 @@ go_library(
"//pkg/fspath",
"//pkg/log",
"//pkg/sentry/fs",
+ "//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/vfs",
"//pkg/syserror",
diff --git a/pkg/sentry/fs/user/path.go b/pkg/sentry/fs/user/path.go
index fbd4547a7..124bc95ed 100644
--- a/pkg/sentry/fs/user/path.go
+++ b/pkg/sentry/fs/user/path.go
@@ -24,6 +24,7 @@ import (
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
@@ -31,7 +32,15 @@ import (
// ResolveExecutablePath resolves the given executable name given the working
// dir and environment.
-func ResolveExecutablePath(ctx context.Context, creds *auth.Credentials, mns *fs.MountNamespace, envv []string, wd, name string) (string, error) {
+func ResolveExecutablePath(ctx context.Context, args *kernel.CreateProcessArgs) (string, error) {
+ name := args.Filename
+ if len(name) == 0 {
+ if len(args.Argv) == 0 {
+ return "", fmt.Errorf("no filename or command provided")
+ }
+ name = args.Argv[0]
+ }
+
// Absolute paths can be used directly.
if path.IsAbs(name) {
return name, nil
@@ -40,6 +49,7 @@ func ResolveExecutablePath(ctx context.Context, creds *auth.Credentials, mns *fs
// Paths with '/' in them should be joined to the working directory, or
// to the root if working directory is not set.
if strings.IndexByte(name, '/') > 0 {
+ wd := args.WorkingDirectory
if wd == "" {
wd = "/"
}
@@ -49,16 +59,30 @@ func ResolveExecutablePath(ctx context.Context, creds *auth.Credentials, mns *fs
return path.Join(wd, name), nil
}
- // Otherwise, We must lookup the name in the paths, starting from the
- // calling context's root directory.
- paths := getPath(envv)
+ // Otherwise, we must look up the name in the paths.
+ paths := getPath(args.Envv)
+ if kernel.VFS2Enabled {
+ f, err := resolveVFS2(ctx, args.Credentials, args.MountNamespaceVFS2, paths, name)
+ if err != nil {
+ return "", fmt.Errorf("error finding executable %q in PATH %v: %v", name, paths, err)
+ }
+ return f, nil
+ }
+
+ f, err := resolve(ctx, args.MountNamespace, paths, name)
+ if err != nil {
+ return "", fmt.Errorf("error finding executable %q in PATH %v: %v", name, paths, err)
+ }
+ return f, nil
+}
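
An illustrative call site for the refactored signature; only the fields ResolveExecutablePath actually consults are shown, and ctx/mns/creds are assumed to come from the surrounding caller:

	args := &kernel.CreateProcessArgs{
		Argv:             []string{"ls", "-l"}, // Filename is empty, so Argv[0] is resolved
		Envv:             []string{"PATH=/usr/bin:/bin"},
		WorkingDirectory: "/home/user",
		MountNamespace:   mns,
		Credentials:      creds,
	}
	binPath, err := user.ResolveExecutablePath(ctx, args)
	if err != nil {
		return err
	}
	_ = binPath // e.g. "/bin/ls" if that is the first PATH match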
+func resolve(ctx context.Context, mns *fs.MountNamespace, paths []string, name string) (string, error) {
root := fs.RootFromContext(ctx)
if root == nil {
// Caller has no root. Don't bother traversing anything.
return "", syserror.ENOENT
}
- defer root.DecRef()
+ defer root.DecRef(ctx)
for _, p := range paths {
if !path.IsAbs(p) {
// Relative paths aren't safe, no one should be using them.
@@ -76,7 +100,7 @@ func ResolveExecutablePath(ctx context.Context, creds *auth.Credentials, mns *fs
if err != nil {
return "", err
}
- defer d.DecRef()
+ defer d.DecRef(ctx)
// Check that it is a regular file.
if !fs.IsRegular(d.Inode.StableAttr) {
@@ -95,32 +119,10 @@ func ResolveExecutablePath(ctx context.Context, creds *auth.Credentials, mns *fs
return "", syserror.ENOENT
}
-// ResolveExecutablePathVFS2 resolves the given executable name given the
-// working dir and environment.
-func ResolveExecutablePathVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, envv []string, wd, name string) (string, error) {
- // Absolute paths can be used directly.
- if path.IsAbs(name) {
- return name, nil
- }
-
- // Paths with '/' in them should be joined to the working directory, or
- // to the root if working directory is not set.
- if strings.IndexByte(name, '/') > 0 {
- if wd == "" {
- wd = "/"
- }
- if !path.IsAbs(wd) {
- return "", fmt.Errorf("working directory %q must be absolute", wd)
- }
- return path.Join(wd, name), nil
- }
-
- // Otherwise, We must lookup the name in the paths, starting from the
- // calling context's root directory.
- paths := getPath(envv)
-
+func resolveVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, paths []string, name string) (string, error) {
root := mns.Root()
- defer root.DecRef()
+ root.IncRef()
+ defer root.DecRef(ctx)
for _, p := range paths {
if !path.IsAbs(p) {
// Relative paths aren't safe, no one should be using them.
@@ -147,7 +149,7 @@ func ResolveExecutablePathVFS2(ctx context.Context, creds *auth.Credentials, mns
if err != nil {
return "", err
}
- dentry.DecRef()
+ dentry.DecRef(ctx)
return binPath, nil
}
diff --git a/pkg/sentry/fs/user/user.go b/pkg/sentry/fs/user/user.go
index f4d525523..1f8684dc6 100644
--- a/pkg/sentry/fs/user/user.go
+++ b/pkg/sentry/fs/user/user.go
@@ -62,7 +62,7 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.K
// doesn't exist we will return the default home directory.
return defaultHome, nil
}
- defer dirent.DecRef()
+ defer dirent.DecRef(ctx)
// Check read permissions on the file.
if err := dirent.Inode.CheckPermission(ctx, fs.PermMask{Read: true}); err != nil {
@@ -81,7 +81,7 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.K
if err != nil {
return "", err
}
- defer f.DecRef()
+ defer f.DecRef(ctx)
r := &fileReader{
Ctx: ctx,
@@ -105,7 +105,8 @@ func getExecUserHomeVFS2(ctx context.Context, mns *vfs.MountNamespace, uid auth.
const defaultHome = "/"
root := mns.Root()
- defer root.DecRef()
+ root.IncRef()
+ defer root.DecRef(ctx)
creds := auth.CredentialsFromContext(ctx)
@@ -123,7 +124,7 @@ func getExecUserHomeVFS2(ctx context.Context, mns *vfs.MountNamespace, uid auth.
if err != nil {
return defaultHome, nil
}
- defer fd.DecRef()
+ defer fd.DecRef(ctx)
r := &fileReaderVFS2{
ctx: ctx,
diff --git a/pkg/sentry/fs/user/user_test.go b/pkg/sentry/fs/user/user_test.go
index 7d8e9ac7c..12b786224 100644
--- a/pkg/sentry/fs/user/user_test.go
+++ b/pkg/sentry/fs/user/user_test.go
@@ -39,7 +39,7 @@ func createEtcPasswd(ctx context.Context, root *fs.Dirent, contents string, mode
if err != nil {
return err
}
- defer etc.DecRef()
+ defer etc.DecRef(ctx)
switch mode.FileType() {
case 0:
// Don't create anything.
@@ -49,7 +49,7 @@ func createEtcPasswd(ctx context.Context, root *fs.Dirent, contents string, mode
if err != nil {
return err
}
- defer passwd.DecRef()
+ defer passwd.DecRef(ctx)
if _, err := passwd.Writev(ctx, usermem.BytesIOSequence([]byte(contents))); err != nil {
return err
}
@@ -110,9 +110,9 @@ func TestGetExecUserHome(t *testing.T) {
if err != nil {
t.Fatalf("NewMountNamespace failed: %v", err)
}
- defer mns.DecRef()
+ defer mns.DecRef(ctx)
root := mns.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
ctx = fs.WithRoot(ctx, root)
if err := createEtcPasswd(ctx, root, tc.passwdContents, tc.passwdMode); err != nil {
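
getExecUserHome and the tests above revolve around scanning /etc/passwd for the uid's home directory. A self-contained sketch of that scan, assuming the usual seven-field name:passwd:uid:gid:gecos:home:shell layout and a "/" fallback like defaultHome (findHome is a hypothetical helper, not the sentry's parser):

package main

import (
	"bufio"
	"fmt"
	"strconv"
	"strings"
)

// findHome scans passwd-formatted content for the given uid and returns
// that user's home directory, or fallback if no well-formed entry matches.
func findHome(passwd string, uid uint32, fallback string) string {
	s := bufio.NewScanner(strings.NewReader(passwd))
	for s.Scan() {
		fields := strings.Split(s.Text(), ":")
		if len(fields) != 7 {
			continue // Malformed line; skip it.
		}
		id, err := strconv.ParseUint(fields[2], 10, 32)
		if err != nil || uint32(id) != uid {
			continue
		}
		return fields[5] // The home field.
	}
	return fallback
}

func main() {
	contents := "adin::1000:1111::/adin:/bin/sh\n"
	fmt.Println(findHome(contents, 1000, "/")) // Prints "/adin".
}
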
diff --git a/pkg/sentry/fsbridge/bridge.go b/pkg/sentry/fsbridge/bridge.go
index 8e7590721..7e61209ee 100644
--- a/pkg/sentry/fsbridge/bridge.go
+++ b/pkg/sentry/fsbridge/bridge.go
@@ -44,7 +44,7 @@ type File interface {
IncRef()
// DecRef decrements reference.
- DecRef()
+ DecRef(ctx context.Context)
}
// Lookup provides a common interface to open files.
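
The interface change above threads a context through DecRef while leaving IncRef context-free. A minimal sketch of a type satisfying the new contract, with an atomic counter standing in for the sentry's reference-counting machinery (refFile and its release behavior are illustrative):

package main

import (
	"context"
	"fmt"
	"sync/atomic"
)

// refFile sketches the updated fsbridge.File reference-counting contract:
// IncRef stays context-free, while DecRef now takes a context so the final
// release can perform context-aware cleanup.
type refFile struct {
	refs int64
}

func (f *refFile) IncRef() { atomic.AddInt64(&f.refs, 1) }

func (f *refFile) DecRef(ctx context.Context) {
	if atomic.AddInt64(&f.refs, -1) == 0 {
		// The context is available for logging or for releasing
		// resources that themselves need a context.
		fmt.Println("released", ctx.Err())
	}
}

func main() {
	f := &refFile{refs: 1}
	f.IncRef()
	ctx := context.Background()
	f.DecRef(ctx)
	f.DecRef(ctx) // Final reference; triggers release.
}
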
diff --git a/pkg/sentry/fsbridge/fs.go b/pkg/sentry/fsbridge/fs.go
index 093ce1fb3..9785fd62a 100644
--- a/pkg/sentry/fsbridge/fs.go
+++ b/pkg/sentry/fsbridge/fs.go
@@ -49,7 +49,7 @@ func (f *fsFile) PathnameWithDeleted(ctx context.Context) string {
// global there.
return ""
}
- defer root.DecRef()
+ defer root.DecRef(ctx)
name, _ := f.file.Dirent.FullName(root)
return name
@@ -87,8 +87,8 @@ func (f *fsFile) IncRef() {
}
// DecRef implements File.
-func (f *fsFile) DecRef() {
- f.file.DecRef()
+func (f *fsFile) DecRef(ctx context.Context) {
+ f.file.DecRef(ctx)
}
// fsLookup implements Lookup interface using fs.File.
@@ -124,7 +124,7 @@ func (l *fsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptio
if err != nil {
return nil, err
}
- defer d.DecRef()
+ defer d.DecRef(ctx)
if !resolveFinal && fs.IsSymlink(d.Inode.StableAttr) {
return nil, syserror.ELOOP
diff --git a/pkg/sentry/fsbridge/vfs.go b/pkg/sentry/fsbridge/vfs.go
index 89168220a..be0900030 100644
--- a/pkg/sentry/fsbridge/vfs.go
+++ b/pkg/sentry/fsbridge/vfs.go
@@ -43,7 +43,7 @@ func NewVFSFile(file *vfs.FileDescription) File {
// PathnameWithDeleted implements File.
func (f *VFSFile) PathnameWithDeleted(ctx context.Context) string {
root := vfs.RootFromContext(ctx)
- defer root.DecRef()
+ defer root.DecRef(ctx)
vfsObj := f.file.VirtualDentry().Mount().Filesystem().VirtualFilesystem()
name, _ := vfsObj.PathnameWithDeleted(ctx, root, f.file.VirtualDentry())
@@ -86,8 +86,8 @@ func (f *VFSFile) IncRef() {
}
// DecRef implements File.
-func (f *VFSFile) DecRef() {
- f.file.DecRef()
+func (f *VFSFile) DecRef(ctx context.Context) {
+ f.file.DecRef(ctx)
}
// FileDescription returns the FileDescription represented by f. It does not
@@ -122,7 +122,7 @@ func NewVFSLookup(mntns *vfs.MountNamespace, root, workingDir vfs.VirtualDentry)
// remainingTraversals is not configurable in VFS2; all callers use the
// default anyway.
func (l *vfsLookup) OpenPath(ctx context.Context, pathname string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) {
- vfsObj := l.mntns.Root().Mount().Filesystem().VirtualFilesystem()
+ vfsObj := l.root.Mount().Filesystem().VirtualFilesystem()
creds := auth.CredentialsFromContext(ctx)
path := fspath.Parse(pathname)
pop := &vfs.PathOperation{
diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD
index 585764223..6af3c3781 100644
--- a/pkg/sentry/fsimpl/devpts/BUILD
+++ b/pkg/sentry/fsimpl/devpts/BUILD
@@ -1,7 +1,19 @@
load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
licenses(["notice"])
+go_template_instance(
+ name = "root_inode_refs",
+ out = "root_inode_refs.go",
+ package = "devpts",
+ prefix = "rootInode",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "rootInode",
+ },
+)
+
go_library(
name = "devpts",
srcs = [
@@ -9,15 +21,23 @@ go_library(
"line_discipline.go",
"master.go",
"queue.go",
- "slave.go",
+ "replica.go",
+ "root_inode_refs.go",
"terminal.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
"//pkg/context",
+ "//pkg/log",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
+ "//pkg/refs",
+ "//pkg/refsvfs2",
"//pkg/safemem",
"//pkg/sentry/arch",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/fsimpl/kernfs",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go
index c03c65445..9185877f6 100644
--- a/pkg/sentry/fsimpl/devpts/devpts.go
+++ b/pkg/sentry/fsimpl/devpts/devpts.go
@@ -35,29 +35,56 @@ import (
const Name = "devpts"
// FilesystemType implements vfs.FilesystemType.
-type FilesystemType struct{}
+//
+// +stateify savable
+type FilesystemType struct {
+ initOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
+ initErr error
+
+ // fs backs all mounts of this FilesystemType. root is fs' root. fs and root
+ // are immutable.
+ fs *vfs.Filesystem
+ root *vfs.Dentry
+}
// Name implements vfs.FilesystemType.Name.
-func (FilesystemType) Name() string {
+func (*FilesystemType) Name() string {
return Name
}
-var _ vfs.FilesystemType = (*FilesystemType)(nil)
-
// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
-func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+func (fstype *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
// No data allowed.
if opts.Data != "" {
return nil, nil, syserror.EINVAL
}
- fs, root, err := fstype.newFilesystem(vfsObj, creds)
- if err != nil {
- return nil, nil, err
+ fstype.initOnce.Do(func() {
+ fs, root, err := fstype.newFilesystem(ctx, vfsObj, creds)
+ if err != nil {
+ fstype.initErr = err
+ return
+ }
+ fstype.fs = fs.VFSFilesystem()
+ fstype.root = root.VFSDentry()
+ })
+ if fstype.initErr != nil {
+ return nil, nil, fstype.initErr
}
- return fs.Filesystem.VFSFilesystem(), root.VFSDentry(), nil
+ fstype.fs.IncRef()
+ fstype.root.IncRef()
+ return fstype.fs, fstype.root, nil
}
+// Release implements vfs.FilesystemType.Release.
+func (fstype *FilesystemType) Release(ctx context.Context) {
+ if fstype.fs != nil {
+ fstype.root.DecRef(ctx)
+ fstype.fs.DecRef(ctx)
+ }
+}
+
+// +stateify savable
type filesystem struct {
kernfs.Filesystem
@@ -66,7 +93,7 @@ type filesystem struct {
// newFilesystem creates a new devpts filesystem with root directory and ptmx
// master inode. It returns the filesystem and root Dentry.
-func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*filesystem, *kernfs.Dentry, error) {
+func (fstype *FilesystemType) newFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*filesystem, *kernfs.Dentry, error) {
devMinor, err := vfsObj.GetAnonBlockDevMinor()
if err != nil {
return nil, nil, err
@@ -79,57 +106,60 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds
// Construct the root directory. This is always inode id 1.
root := &rootInode{
- slaves: make(map[uint32]*slaveInode),
+ replicas: make(map[uint32]*replicaInode),
}
- root.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555)
+ root.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555)
root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
- root.dentry.Init(root)
+ root.EnableLeakCheck()
+
+ var rootD kernfs.Dentry
+ rootD.Init(&fs.Filesystem, root)
// Construct the pts master inode and dentry. Linux always uses inode
// id 2 for ptmx. See fs/devpts/inode.c:mknod_ptmx.
master := &masterInode{
root: root,
}
- master.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 2, linux.ModeCharacterDevice|0666)
- master.dentry.Init(master)
+ master.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, devMinor, 2, linux.ModeCharacterDevice|0666)
// Add the master as a child of the root.
- links := root.OrderedChildren.Populate(&root.dentry, map[string]*kernfs.Dentry{
- "ptmx": &master.dentry,
+ links := root.OrderedChildren.Populate(map[string]kernfs.Inode{
+ "ptmx": master,
})
root.IncLinks(links)
- return fs, &root.dentry, nil
+ return fs, &rootD, nil
}
// Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {
+func (fs *filesystem) Release(ctx context.Context) {
fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
- fs.Filesystem.Release()
+ fs.Filesystem.Release(ctx)
}
// rootInode is the root directory inode for the devpts mounts.
+//
+// +stateify savable
type rootInode struct {
- kernfs.AlwaysValid
+ implStatFS
+ kernfs.InodeAlwaysValid
kernfs.InodeAttrs
kernfs.InodeDirectoryNoNewChildren
kernfs.InodeNotSymlink
+ kernfs.InodeTemporary // This holds no meaning here, since this inode can't be looked up and is always valid.
kernfs.OrderedChildren
+ rootInodeRefs
- // Keep a reference to this inode's dentry.
- dentry kernfs.Dentry
+ locks vfs.FileLocks
// master is the master pty inode. Immutable.
master *masterInode
- // root is the root directory inode for this filesystem. Immutable.
- root *rootInode
-
// mu protects the fields below.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
- // slaves maps pty ids to slave inodes.
- slaves map[uint32]*slaveInode
+ // replicas maps pty ids to replica inodes.
+ replicas map[uint32]*replicaInode
// nextIdx is the next pty index to use. Must be accessed atomically.
//
@@ -140,7 +170,7 @@ type rootInode struct {
var _ kernfs.Inode = (*rootInode)(nil)
// allocateTerminal creates a new Terminal and installs a pts node for it.
-func (i *rootInode) allocateTerminal(creds *auth.Credentials) (*Terminal, error) {
+func (i *rootInode) allocateTerminal(ctx context.Context, creds *auth.Credentials) (*Terminal, error) {
i.mu.Lock()
defer i.mu.Unlock()
if i.nextIdx == math.MaxUint32 {
@@ -149,41 +179,46 @@ func (i *rootInode) allocateTerminal(creds *auth.Credentials) (*Terminal, error)
idx := i.nextIdx
i.nextIdx++
- // Sanity check that slave with idx does not exist.
- if _, ok := i.slaves[idx]; ok {
+ // Sanity check that replica with idx does not exist.
+ if _, ok := i.replicas[idx]; ok {
panic(fmt.Sprintf("pty index collision; index %d already exists", idx))
}
- // Create the new terminal and slave.
+ // Create the new terminal and replica.
t := newTerminal(idx)
- slave := &slaveInode{
+ replica := &replicaInode{
root: i,
t: t,
}
// Linux always uses pty index + 3 as the inode id. See
// fs/devpts/inode.c:devpts_pty_new().
- slave.InodeAttrs.Init(creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600)
- slave.dentry.Init(slave)
- i.slaves[idx] = slave
+ replica.InodeAttrs.Init(ctx, creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600)
+ i.replicas[idx] = replica
return t, nil
}
// masterClose is called when the master end of t is closed.
-func (i *rootInode) masterClose(t *Terminal) {
+func (i *rootInode) masterClose(ctx context.Context, t *Terminal) {
i.mu.Lock()
defer i.mu.Unlock()
- // Sanity check that slave with idx exists.
- if _, ok := i.slaves[t.n]; !ok {
+ // Sanity check that replica with idx exists.
+ ri, ok := i.replicas[t.n]
+ if !ok {
panic(fmt.Sprintf("pty with index %d does not exist", t.n))
}
- delete(i.slaves, t.n)
+
+ // Drop the ref on replica inode taken during rootInode.allocateTerminal.
+ ri.DecRef(ctx)
+ delete(i.replicas, t.n)
}
// Open implements kernfs.Inode.Open.
-func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+ SeekEnd: kernfs.SeekEndStaticEntries,
+ })
if err != nil {
return nil, err
}
@@ -191,27 +226,34 @@ func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D
}
// Lookup implements kernfs.Inode.Lookup.
-func (i *rootInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+func (i *rootInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
+ // Check if a static entry was looked up.
+ if d, err := i.OrderedChildren.Lookup(ctx, name); err == nil {
+ return d, nil
+ }
+
+ // Not a static entry.
idx, err := strconv.ParseUint(name, 10, 32)
if err != nil {
return nil, syserror.ENOENT
}
i.mu.Lock()
defer i.mu.Unlock()
- if si, ok := i.slaves[uint32(idx)]; ok {
- si.dentry.IncRef()
- return si.dentry.VFSDentry(), nil
+ if ri, ok := i.replicas[uint32(idx)]; ok {
+ ri.IncRef() // This ref is passed to the dentry upon creation via Init.
+ return ri, nil
}
return nil, syserror.ENOENT
}
// IterDirents implements kernfs.Inode.IterDirents.
-func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+func (i *rootInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
i.mu.Lock()
defer i.mu.Unlock()
- ids := make([]int, 0, len(i.slaves))
- for id := range i.slaves {
+ i.InodeAttrs.TouchAtime(ctx, mnt)
+ ids := make([]int, 0, len(i.replicas))
+ for id := range i.replicas {
ids = append(ids, int(id))
}
sort.Ints(ids)
@@ -219,7 +261,7 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback,
dirent := vfs.Dirent{
Name: strconv.FormatUint(uint64(id), 10),
Type: linux.DT_CHR,
- Ino: i.slaves[uint32(id)].InodeAttrs.Ino(),
+ Ino: i.replicas[uint32(id)].InodeAttrs.Ino(),
NextOff: offset + 1,
}
if err := cb.Handle(dirent); err != nil {
@@ -229,3 +271,16 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback,
}
return offset, nil
}
+
+// DecRef implements kernfs.Inode.DecRef.
+func (i *rootInode) DecRef(ctx context.Context) {
+ i.rootInodeRefs.DecRef(func() { i.Destroy(ctx) })
+}
+
+// +stateify savable
+type implStatFS struct{}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
+ return vfs.GenericStatFS(linux.DEVPTS_SUPER_MAGIC), nil
+}
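
GetFilesystem above turns devpts into a per-FilesystemType singleton: sync.Once guards construction, the construction error is latched, and each successful mount takes fresh references on the shared filesystem and root. A compact sketch of that pattern with simplified stand-in types (onceFS is hypothetical; note that a latched initErr makes every later mount fail the same way):

package main

import (
	"fmt"
	"sync"
)

// onceFS constructs its backing filesystem exactly once, records any
// construction failure, and hands out the same instance to every caller.
type onceFS struct {
	initOnce sync.Once
	initErr  error
	fs       *struct{ refs int }
}

func (o *onceFS) get() (*struct{ refs int }, error) {
	o.initOnce.Do(func() {
		// Construct the backing filesystem; a failure here would
		// be latched in o.initErr.
		o.fs = &struct{ refs int }{}
	})
	if o.initErr != nil {
		return nil, o.initErr
	}
	o.fs.refs++ // Each successful mount takes its own reference.
	return o.fs, nil
}

func main() {
	var fst onceFS
	a, _ := fst.get()
	b, _ := fst.get()
	fmt.Println(a == b, a.refs) // true 2: one fs shared by both mounts.
}
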
diff --git a/pkg/sentry/fsimpl/devpts/devpts_test.go b/pkg/sentry/fsimpl/devpts/devpts_test.go
index b7c149047..448390cfe 100644
--- a/pkg/sentry/fsimpl/devpts/devpts_test.go
+++ b/pkg/sentry/fsimpl/devpts/devpts_test.go
@@ -22,8 +22,8 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
-func TestSimpleMasterToSlave(t *testing.T) {
- ld := newLineDiscipline(linux.DefaultSlaveTermios)
+func TestSimpleMasterToReplica(t *testing.T) {
+ ld := newLineDiscipline(linux.DefaultReplicaTermios)
ctx := contexttest.Context(t)
inBytes := []byte("hello, tty\n")
src := usermem.BytesIOSequence(inBytes)
diff --git a/pkg/sentry/fsimpl/devpts/line_discipline.go b/pkg/sentry/fsimpl/devpts/line_discipline.go
index f7bc325d1..ae95fdd08 100644
--- a/pkg/sentry/fsimpl/devpts/line_discipline.go
+++ b/pkg/sentry/fsimpl/devpts/line_discipline.go
@@ -21,6 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -41,7 +42,7 @@ const (
)
// lineDiscipline dictates how input and output are handled between the
-// pseudoterminal (pty) master and slave. It can be configured to alter I/O,
+// pseudoterminal (pty) master and replica. It can be configured to alter I/O,
// modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man
// pages are good resources for how to affect the line discipline:
//
@@ -52,8 +53,8 @@ const (
//
// lineDiscipline has a simple structure but supports a multitude of options
// (see the above man pages). It consists of two queues of bytes: one from the
-// terminal master to slave (the input queue) and one from slave to master (the
-// output queue). When bytes are written to one end of the pty, the line
+// terminal master to replica (the input queue) and one from replica to master
+// (the output queue). When bytes are written to one end of the pty, the line
// discipline reads the bytes, modifies them or takes special action if
// required, and enqueues them to be read by the other end of the pty:
//
@@ -62,7 +63,7 @@ const (
// | (inputQueueWrite) +-------------+ (inputQueueRead) |
// | |
// | v
-// masterFD slaveFD
+// masterFD replicaFD
// ^ |
// | |
// | output to terminal +--------------+ output from process |
@@ -99,10 +100,10 @@ type lineDiscipline struct {
column int
// masterWaiter is used to wait on the master end of the TTY.
- masterWaiter waiter.Queue `state:"zerovalue"`
+ masterWaiter waiter.Queue
- // slaveWaiter is used to wait on the slave end of the TTY.
- slaveWaiter waiter.Queue `state:"zerovalue"`
+ // replicaWaiter is used to wait on the replica end of the TTY.
+ replicaWaiter waiter.Queue
}
func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
@@ -113,27 +114,23 @@ func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
}
// getTermios gets the linux.Termios for the tty.
-func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (l *lineDiscipline) getTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) {
l.termiosMu.RLock()
defer l.termiosMu.RUnlock()
// We must copy a Termios struct, not KernelTermios.
t := l.termios.ToTermios()
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := t.CopyOut(task, args[2].Pointer())
return 0, err
}
// setTermios sets a linux.Termios for the tty.
-func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (l *lineDiscipline) setTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) {
l.termiosMu.Lock()
defer l.termiosMu.Unlock()
oldCanonEnabled := l.termios.LEnabled(linux.ICANON)
// We must copy a Termios struct, not KernelTermios.
var t linux.Termios
- _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := t.CopyIn(task, args[2].Pointer())
l.termios.FromTermios(t)
// If canonical mode is turned off, move bytes from inQueue's wait
@@ -144,27 +141,23 @@ func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arc
l.inQueue.pushWaitBufLocked(l)
l.inQueue.readable = true
l.inQueue.mu.Unlock()
- l.slaveWaiter.Notify(waiter.EventIn)
+ l.replicaWaiter.Notify(waiter.EventIn)
}
return 0, err
}
-func (l *lineDiscipline) windowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (l *lineDiscipline) windowSize(t *kernel.Task, args arch.SyscallArguments) error {
l.sizeMu.Lock()
defer l.sizeMu.Unlock()
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), l.size, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := l.size.CopyOut(t, args[2].Pointer())
return err
}
-func (l *lineDiscipline) setWindowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (l *lineDiscipline) setWindowSize(t *kernel.Task, args arch.SyscallArguments) error {
l.sizeMu.Lock()
defer l.sizeMu.Unlock()
- _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &l.size, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := l.size.CopyIn(t, args[2].Pointer())
return err
}
@@ -174,14 +167,14 @@ func (l *lineDiscipline) masterReadiness() waiter.EventMask {
return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios)
}
-func (l *lineDiscipline) slaveReadiness() waiter.EventMask {
+func (l *lineDiscipline) replicaReadiness() waiter.EventMask {
l.termiosMu.RLock()
defer l.termiosMu.RUnlock()
return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios)
}
-func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
- return l.inQueue.readableSize(ctx, io, args)
+func (l *lineDiscipline) inputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error {
+ return l.inQueue.readableSize(t, io, args)
}
func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
@@ -194,7 +187,7 @@ func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSeque
if n > 0 {
l.masterWaiter.Notify(waiter.EventOut)
if pushed {
- l.slaveWaiter.Notify(waiter.EventIn)
+ l.replicaWaiter.Notify(waiter.EventIn)
}
return n, nil
}
@@ -209,14 +202,14 @@ func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequ
return 0, err
}
if n > 0 {
- l.slaveWaiter.Notify(waiter.EventIn)
+ l.replicaWaiter.Notify(waiter.EventIn)
return n, nil
}
return 0, syserror.ErrWouldBlock
}
-func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
- return l.outQueue.readableSize(ctx, io, args)
+func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error {
+ return l.outQueue.readableSize(t, io, args)
}
func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
@@ -227,7 +220,7 @@ func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequ
return 0, err
}
if n > 0 {
- l.slaveWaiter.Notify(waiter.EventOut)
+ l.replicaWaiter.Notify(waiter.EventOut)
if pushed {
l.masterWaiter.Notify(waiter.EventIn)
}
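
The line-discipline ioctls above migrate from usermem.CopyObjectOut/CopyObjectIn to marshallable primitive types whose CopyOut/CopyIn take the task directly. A toy sketch of that shape, with a map standing in for the user address space (copyContext and Int32 are simplified stand-ins for kernel.Task and primitive.Int32, not the real marshal API):

package main

import (
	"encoding/binary"
	"fmt"
)

// copyContext stands in for kernel.Task: something that can copy bytes out
// to a (simulated) user address space.
type copyContext struct{ mem map[uintptr][]byte }

func (c *copyContext) CopyOutBytes(addr uintptr, b []byte) (int, error) {
	c.mem[addr] = append([]byte(nil), b...)
	return len(b), nil
}

type Int32 int32

// CopyOut marshals the value to addr via the copy context, mirroring the
// size.CopyOut(t, args[2].Pointer()) calls in the diff above.
func (i Int32) CopyOut(cc *copyContext, addr uintptr) (int, error) {
	var buf [4]byte
	binary.LittleEndian.PutUint32(buf[:], uint32(i))
	return cc.CopyOutBytes(addr, buf[:])
}

func main() {
	cc := &copyContext{mem: make(map[uintptr][]byte)}
	size := Int32(42)
	if _, err := size.CopyOut(cc, 0x1000); err == nil {
		fmt.Printf("% x\n", cc.mem[0x1000]) // 2a 00 00 00
	}
}
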
diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go
index 7a7ce5d81..e91fa26a4 100644
--- a/pkg/sentry/fsimpl/devpts/master.go
+++ b/pkg/sentry/fsimpl/devpts/master.go
@@ -17,8 +17,11 @@ package devpts
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/unimpl"
"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -28,14 +31,16 @@ import (
)
// masterInode is the inode for the master end of the Terminal.
+//
+// +stateify savable
type masterInode struct {
+ implStatFS
kernfs.InodeAttrs
kernfs.InodeNoopRefCount
kernfs.InodeNotDirectory
kernfs.InodeNotSymlink
- // Keep a reference to this inode's dentry.
- dentry kernfs.Dentry
+ locks vfs.FileLocks
// root is the devpts root inode.
root *rootInode
@@ -44,27 +49,26 @@ type masterInode struct {
var _ kernfs.Inode = (*masterInode)(nil)
// Open implements kernfs.Inode.Open.
-func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- t, err := mi.root.allocateTerminal(rp.Credentials())
+func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ t, err := mi.root.allocateTerminal(ctx, rp.Credentials())
if err != nil {
return nil, err
}
- mi.IncRef()
fd := &masterFileDescription{
inode: mi,
t: t,
}
- if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
- mi.DecRef()
+ fd.LockFD.Init(&mi.locks)
+ if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
return nil, err
}
return &fd.vfsfd, nil
}
// Stat implements kernfs.Inode.Stat.
-func (mi *masterInode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
- statx, err := mi.InodeAttrs.Stat(vfsfs, opts)
+func (mi *masterInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+ statx, err := mi.InodeAttrs.Stat(ctx, vfsfs, opts)
if err != nil {
return linux.Statx{}, err
}
@@ -82,9 +86,11 @@ func (mi *masterInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds
return mi.InodeAttrs.SetStat(ctx, vfsfs, creds, opts)
}
+// +stateify savable
type masterFileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
+ vfs.LockFD
inode *masterInode
t *Terminal
@@ -93,9 +99,8 @@ type masterFileDescription struct {
var _ vfs.FileDescriptionImpl = (*masterFileDescription)(nil)
// Release implements vfs.FileDescriptionImpl.Release.
-func (mfd *masterFileDescription) Release() {
- mfd.inode.root.masterClose(mfd.t)
- mfd.inode.DecRef()
+func (mfd *masterFileDescription) Release(ctx context.Context) {
+ mfd.inode.root.masterClose(ctx, mfd.t)
}
// EventRegister implements waiter.Waitable.EventRegister.
@@ -125,46 +130,51 @@ func (mfd *masterFileDescription) Write(ctx context.Context, src usermem.IOSeque
// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
func (mfd *masterFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ // ioctl(2) may only be called from a task goroutine.
+ return 0, syserror.ENOTTY
+ }
+
switch cmd := args[1].Uint(); cmd {
case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
// Get the number of bytes in the output queue read buffer.
- return 0, mfd.t.ld.outputQueueReadSize(ctx, io, args)
+ return 0, mfd.t.ld.outputQueueReadSize(t, io, args)
case linux.TCGETS:
// N.B. TCGETS on the master actually returns the configuration
- // of the slave end.
- return mfd.t.ld.getTermios(ctx, io, args)
+ // of the replica end.
+ return mfd.t.ld.getTermios(t, args)
case linux.TCSETS:
// N.B. TCSETS on the master actually affects the configuration
- // of the slave end.
- return mfd.t.ld.setTermios(ctx, io, args)
+ // of the replica end.
+ return mfd.t.ld.setTermios(t, args)
case linux.TCSETSW:
// TODO(b/29356795): This should drain the output queue first.
- return mfd.t.ld.setTermios(ctx, io, args)
+ return mfd.t.ld.setTermios(t, args)
case linux.TIOCGPTN:
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mfd.t.n), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ nP := primitive.Uint32(mfd.t.n)
+ _, err := nP.CopyOut(t, args[2].Pointer())
return 0, err
case linux.TIOCSPTLCK:
// TODO(b/29356795): Implement pty locking. For now just pretend we do.
return 0, nil
case linux.TIOCGWINSZ:
- return 0, mfd.t.ld.windowSize(ctx, io, args)
+ return 0, mfd.t.ld.windowSize(t, args)
case linux.TIOCSWINSZ:
- return 0, mfd.t.ld.setWindowSize(ctx, io, args)
+ return 0, mfd.t.ld.setWindowSize(t, args)
case linux.TIOCSCTTY:
// Make the given terminal the controlling terminal of the
// calling process.
- return 0, mfd.t.setControllingTTY(ctx, io, args, true /* isMaster */)
+ return 0, mfd.t.setControllingTTY(ctx, args, true /* isMaster */)
case linux.TIOCNOTTY:
// Release this process's controlling terminal.
- return 0, mfd.t.releaseControllingTTY(ctx, io, args, true /* isMaster */)
+ return 0, mfd.t.releaseControllingTTY(ctx, args, true /* isMaster */)
case linux.TIOCGPGRP:
// Get the foreground process group.
- return mfd.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */)
+ return mfd.t.foregroundProcessGroup(ctx, args, true /* isMaster */)
case linux.TIOCSPGRP:
// Set the foreground process group.
- return mfd.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */)
+ return mfd.t.setForegroundProcessGroup(ctx, args, true /* isMaster */)
default:
maybeEmitUnimplementedEvent(ctx, cmd)
return 0, syserror.ENOTTY
@@ -181,7 +191,17 @@ func (mfd *masterFileDescription) SetStat(ctx context.Context, opts vfs.SetStatO
// Stat implements vfs.FileDescriptionImpl.Stat.
func (mfd *masterFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
fs := mfd.vfsfd.VirtualDentry().Mount().Filesystem()
- return mfd.inode.Stat(fs, opts)
+ return mfd.inode.Stat(ctx, fs, opts)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (mfd *masterFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return mfd.Locks().LockPOSIX(ctx, &mfd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (mfd *masterFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return mfd.Locks().UnlockPOSIX(ctx, &mfd.vfsfd, uid, start, length, whence)
}
// maybeEmitUnimplementedEvent emits unimplemented event if cmd is valid.
diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go
index dffb4232c..55bff3e60 100644
--- a/pkg/sentry/fsimpl/devpts/queue.go
+++ b/pkg/sentry/fsimpl/devpts/queue.go
@@ -17,8 +17,10 @@ package devpts
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -30,7 +32,7 @@ import (
const waitBufMaxBytes = 131072
// queue represents one of the input or output queues between a pty master and
-// slave. Bytes written to a queue are added to the read buffer until it is
+// replica. Bytes written to a queue are added to the read buffer until it is
// full, at which point they are written to the wait buffer. Bytes are
// processed (i.e. undergo termios transformations) as they are added to the
// read buffer. The read buffer is readable when its length is nonzero and
@@ -83,17 +85,15 @@ func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask {
}
// readableSize writes the number of readable bytes to userspace.
-func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (q *queue) readableSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error {
q.mu.Lock()
defer q.mu.Unlock()
- var size int32
+ size := primitive.Int32(0)
if q.readable {
- size = int32(len(q.readBuf))
+ size = primitive.Int32(len(q.readBuf))
}
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := size.CopyOut(t, args[2].Pointer())
return err
}
@@ -102,8 +102,7 @@ func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.Sysca
// as whether the read caused more readable data to become available (whether
// data was pushed from the wait buffer to the read buffer).
//
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, error) {
q.mu.Lock()
defer q.mu.Unlock()
@@ -143,8 +142,7 @@ func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipl
// write writes to q from userspace.
//
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, error) {
q.mu.Lock()
defer q.mu.Unlock()
@@ -186,8 +184,7 @@ func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscip
// writeBytes writes to q from b.
//
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
func (q *queue) writeBytes(b []byte, l *lineDiscipline) {
q.mu.Lock()
defer q.mu.Unlock()
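
The queue comment above describes a bounded read buffer backed by an overflow wait buffer, with waiting bytes pulled forward as the reader frees space. A runnable sketch of just that double-buffer flow, omitting termios processing and locking (twoBuf is illustrative, and the waitBufMaxBytes-style cap is elided):

package main

import "fmt"

// twoBuf holds a bounded read buffer, readable by the other end of the
// pty, plus an overflow wait buffer that is drained as space frees up.
type twoBuf struct {
	readBuf []byte
	waitBuf []byte
	max     int
}

func (q *twoBuf) write(b []byte) {
	room := q.max - len(q.readBuf)
	if room > len(b) {
		room = len(b)
	}
	q.readBuf = append(q.readBuf, b[:room]...)
	q.waitBuf = append(q.waitBuf, b[room:]...) // Overflow waits.
}

func (q *twoBuf) read(n int) []byte {
	if n > len(q.readBuf) {
		n = len(q.readBuf)
	}
	out := q.readBuf[:n:n]
	q.readBuf = append([]byte(nil), q.readBuf[n:]...)
	q.pushWait() // Pull waiting bytes into the freed space.
	return out
}

func (q *twoBuf) pushWait() {
	room := q.max - len(q.readBuf)
	if room > len(q.waitBuf) {
		room = len(q.waitBuf)
	}
	q.readBuf = append(q.readBuf, q.waitBuf[:room]...)
	q.waitBuf = append([]byte(nil), q.waitBuf[room:]...)
}

func main() {
	q := &twoBuf{max: 4}
	q.write([]byte("hello!"))
	fmt.Printf("%s %s\n", q.readBuf, q.waitBuf) // hell o!
	fmt.Printf("%s\n", q.read(2))               // he
	fmt.Printf("%s %q\n", q.readBuf, q.waitBuf) // llo! "" (wait drained)
}
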
diff --git a/pkg/sentry/fsimpl/devpts/replica.go b/pkg/sentry/fsimpl/devpts/replica.go
new file mode 100644
index 000000000..70c68cf0a
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/replica.go
@@ -0,0 +1,201 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devpts
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// replicaInode is the inode for the replica end of the Terminal.
+//
+// +stateify savable
+type replicaInode struct {
+ implStatFS
+ kernfs.InodeAttrs
+ kernfs.InodeNoopRefCount
+ kernfs.InodeNotDirectory
+ kernfs.InodeNotSymlink
+
+ locks vfs.FileLocks
+
+ // root is the devpts root inode.
+ root *rootInode
+
+ // t is the connected Terminal.
+ t *Terminal
+}
+
+var _ kernfs.Inode = (*replicaInode)(nil)
+
+// Open implements kernfs.Inode.Open.
+func (ri *replicaInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd := &replicaFileDescription{
+ inode: ri,
+ }
+ fd.LockFD.Init(&ri.locks)
+ if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
+
+// Valid implements kernfs.Inode.Valid.
+func (ri *replicaInode) Valid(context.Context) bool {
+ // Return valid if the replica still exists.
+ ri.root.mu.Lock()
+ defer ri.root.mu.Unlock()
+ _, ok := ri.root.replicas[ri.t.n]
+ return ok
+}
+
+// Stat implements kernfs.Inode.Stat.
+func (ri *replicaInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+ statx, err := ri.InodeAttrs.Stat(ctx, vfsfs, opts)
+ if err != nil {
+ return linux.Statx{}, err
+ }
+ statx.Blksize = 1024
+ statx.RdevMajor = linux.UNIX98_PTY_REPLICA_MAJOR
+ statx.RdevMinor = ri.t.n
+ return statx, nil
+}
+
+// SetStat implements kernfs.Inode.SetStat.
+func (ri *replicaInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+ if opts.Stat.Mask&linux.STATX_SIZE != 0 {
+ return syserror.EINVAL
+ }
+ return ri.InodeAttrs.SetStat(ctx, vfsfs, creds, opts)
+}
+
+// +stateify savable
+type replicaFileDescription struct {
+ vfsfd vfs.FileDescription
+ vfs.FileDescriptionDefaultImpl
+ vfs.LockFD
+
+ inode *replicaInode
+}
+
+var _ vfs.FileDescriptionImpl = (*replicaFileDescription)(nil)
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (rfd *replicaFileDescription) Release(ctx context.Context) {}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (rfd *replicaFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ rfd.inode.t.ld.replicaWaiter.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (rfd *replicaFileDescription) EventUnregister(e *waiter.Entry) {
+ rfd.inode.t.ld.replicaWaiter.EventUnregister(e)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (rfd *replicaFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return rfd.inode.t.ld.replicaReadiness()
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (rfd *replicaFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
+ return rfd.inode.t.ld.inputQueueRead(ctx, dst)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (rfd *replicaFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
+ return rfd.inode.t.ld.outputQueueWrite(ctx, src)
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (rfd *replicaFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ // ioctl(2) may only be called from a task goroutine.
+ return 0, syserror.ENOTTY
+ }
+
+ switch cmd := args[1].Uint(); cmd {
+ case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
+ // Get the number of bytes in the input queue read buffer.
+ return 0, rfd.inode.t.ld.inputQueueReadSize(t, io, args)
+ case linux.TCGETS:
+ return rfd.inode.t.ld.getTermios(t, args)
+ case linux.TCSETS:
+ return rfd.inode.t.ld.setTermios(t, args)
+ case linux.TCSETSW:
+ // TODO(b/29356795): This should drain the output queue first.
+ return rfd.inode.t.ld.setTermios(t, args)
+ case linux.TIOCGPTN:
+ nP := primitive.Uint32(rfd.inode.t.n)
+ _, err := nP.CopyOut(t, args[2].Pointer())
+ return 0, err
+ case linux.TIOCGWINSZ:
+ return 0, rfd.inode.t.ld.windowSize(t, args)
+ case linux.TIOCSWINSZ:
+ return 0, rfd.inode.t.ld.setWindowSize(t, args)
+ case linux.TIOCSCTTY:
+ // Make the given terminal the controlling terminal of the
+ // calling process.
+ return 0, rfd.inode.t.setControllingTTY(ctx, args, false /* isMaster */)
+ case linux.TIOCNOTTY:
+ // Release this process's controlling terminal.
+ return 0, rfd.inode.t.releaseControllingTTY(ctx, args, false /* isMaster */)
+ case linux.TIOCGPGRP:
+ // Get the foreground process group.
+ return rfd.inode.t.foregroundProcessGroup(ctx, args, false /* isMaster */)
+ case linux.TIOCSPGRP:
+ // Set the foreground process group.
+ return rfd.inode.t.setForegroundProcessGroup(ctx, args, false /* isMaster */)
+ default:
+ maybeEmitUnimplementedEvent(ctx, cmd)
+ return 0, syserror.ENOTTY
+ }
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (rfd *replicaFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+ creds := auth.CredentialsFromContext(ctx)
+ fs := rfd.vfsfd.VirtualDentry().Mount().Filesystem()
+ return rfd.inode.SetStat(ctx, fs, creds, opts)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (rfd *replicaFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+ fs := rfd.vfsfd.VirtualDentry().Mount().Filesystem()
+ return rfd.inode.Stat(ctx, fs, opts)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (rfd *replicaFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return rfd.Locks().LockPOSIX(ctx, &rfd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (rfd *replicaFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return rfd.Locks().UnlockPOSIX(ctx, &rfd.vfsfd, uid, start, length, whence)
+}
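
Both master and replica Ioctl implementations above now begin by recovering the calling task from the context and failing with ENOTTY when there is none, since the copy-in/copy-out helpers need a task's address space. A sketch of that guard using a plain context value in place of kernel.TaskFromContext (taskKey, task, and ioctl are illustrative):

package main

import (
	"context"
	"errors"
	"fmt"
)

var errENOTTY = errors.New("ENOTTY")

type taskKey struct{}

type task struct{ name string }

// taskFromContext mirrors kernel.TaskFromContext: it returns the calling
// task carried by ctx, or nil when ctx is not a task goroutine's context.
func taskFromContext(ctx context.Context) *task {
	t, _ := ctx.Value(taskKey{}).(*task)
	return t
}

func ioctl(ctx context.Context, cmd uint32) error {
	t := taskFromContext(ctx)
	if t == nil {
		// Not a task goroutine; fail the way the devpts handlers do.
		return errENOTTY
	}
	fmt.Printf("task %s handles ioctl %#x\n", t.name, cmd)
	return nil
}

func main() {
	fmt.Println(ioctl(context.Background(), 0x5401)) // ENOTTY
	ctx := context.WithValue(context.Background(), taskKey{}, &task{name: "t1"})
	fmt.Println(ioctl(ctx, 0x5401)) // handled
}
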
diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go
deleted file mode 100644
index 526cd406c..000000000
--- a/pkg/sentry/fsimpl/devpts/slave.go
+++ /dev/null
@@ -1,182 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package devpts
-
-import (
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
- "gvisor.dev/gvisor/pkg/waiter"
-)
-
-// slaveInode is the inode for the slave end of the Terminal.
-type slaveInode struct {
- kernfs.InodeAttrs
- kernfs.InodeNoopRefCount
- kernfs.InodeNotDirectory
- kernfs.InodeNotSymlink
-
- // Keep a reference to this inode's dentry.
- dentry kernfs.Dentry
-
- // root is the devpts root inode.
- root *rootInode
-
- // t is the connected Terminal.
- t *Terminal
-}
-
-var _ kernfs.Inode = (*slaveInode)(nil)
-
-// Open implements kernfs.Inode.Open.
-func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- si.IncRef()
- fd := &slaveFileDescription{
- inode: si,
- }
- if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
- si.DecRef()
- return nil, err
- }
- return &fd.vfsfd, nil
-
-}
-
-// Valid implements kernfs.Inode.Valid.
-func (si *slaveInode) Valid(context.Context) bool {
- // Return valid if the slave still exists.
- si.root.mu.Lock()
- defer si.root.mu.Unlock()
- _, ok := si.root.slaves[si.t.n]
- return ok
-}
-
-// Stat implements kernfs.Inode.Stat.
-func (si *slaveInode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
- statx, err := si.InodeAttrs.Stat(vfsfs, opts)
- if err != nil {
- return linux.Statx{}, err
- }
- statx.Blksize = 1024
- statx.RdevMajor = linux.UNIX98_PTY_SLAVE_MAJOR
- statx.RdevMinor = si.t.n
- return statx, nil
-}
-
-// SetStat implements kernfs.Inode.SetStat
-func (si *slaveInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
- if opts.Stat.Mask&linux.STATX_SIZE != 0 {
- return syserror.EINVAL
- }
- return si.InodeAttrs.SetStat(ctx, vfsfs, creds, opts)
-}
-
-type slaveFileDescription struct {
- vfsfd vfs.FileDescription
- vfs.FileDescriptionDefaultImpl
-
- inode *slaveInode
-}
-
-var _ vfs.FileDescriptionImpl = (*slaveFileDescription)(nil)
-
-// Release implements fs.FileOperations.Release.
-func (sfd *slaveFileDescription) Release() {
- sfd.inode.DecRef()
-}
-
-// EventRegister implements waiter.Waitable.EventRegister.
-func (sfd *slaveFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
- sfd.inode.t.ld.slaveWaiter.EventRegister(e, mask)
-}
-
-// EventUnregister implements waiter.Waitable.EventUnregister.
-func (sfd *slaveFileDescription) EventUnregister(e *waiter.Entry) {
- sfd.inode.t.ld.slaveWaiter.EventUnregister(e)
-}
-
-// Readiness implements waiter.Waitable.Readiness.
-func (sfd *slaveFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
- return sfd.inode.t.ld.slaveReadiness()
-}
-
-// Read implements vfs.FileDescriptionImpl.Read.
-func (sfd *slaveFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
- return sfd.inode.t.ld.inputQueueRead(ctx, dst)
-}
-
-// Write implements vfs.FileDescriptionImpl.Write.
-func (sfd *slaveFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
- return sfd.inode.t.ld.outputQueueWrite(ctx, src)
-}
-
-// Ioctl implements vfs.FileDescripionImpl.Ioctl.
-func (sfd *slaveFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
- switch cmd := args[1].Uint(); cmd {
- case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
- // Get the number of bytes in the input queue read buffer.
- return 0, sfd.inode.t.ld.inputQueueReadSize(ctx, io, args)
- case linux.TCGETS:
- return sfd.inode.t.ld.getTermios(ctx, io, args)
- case linux.TCSETS:
- return sfd.inode.t.ld.setTermios(ctx, io, args)
- case linux.TCSETSW:
- // TODO(b/29356795): This should drain the output queue first.
- return sfd.inode.t.ld.setTermios(ctx, io, args)
- case linux.TIOCGPTN:
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sfd.inode.t.n), usermem.IOOpts{
- AddressSpaceActive: true,
- })
- return 0, err
- case linux.TIOCGWINSZ:
- return 0, sfd.inode.t.ld.windowSize(ctx, io, args)
- case linux.TIOCSWINSZ:
- return 0, sfd.inode.t.ld.setWindowSize(ctx, io, args)
- case linux.TIOCSCTTY:
- // Make the given terminal the controlling terminal of the
- // calling process.
- return 0, sfd.inode.t.setControllingTTY(ctx, io, args, false /* isMaster */)
- case linux.TIOCNOTTY:
- // Release this process's controlling terminal.
- return 0, sfd.inode.t.releaseControllingTTY(ctx, io, args, false /* isMaster */)
- case linux.TIOCGPGRP:
- // Get the foreground process group.
- return sfd.inode.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */)
- case linux.TIOCSPGRP:
- // Set the foreground process group.
- return sfd.inode.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */)
- default:
- maybeEmitUnimplementedEvent(ctx, cmd)
- return 0, syserror.ENOTTY
- }
-}
-
-// SetStat implements vfs.FileDescriptionImpl.SetStat.
-func (sfd *slaveFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
- creds := auth.CredentialsFromContext(ctx)
- fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
- return sfd.inode.SetStat(ctx, fs, creds, opts)
-}
-
-// Stat implements vfs.FileDescriptionImpl.Stat.
-func (sfd *slaveFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
- fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
- return sfd.inode.Stat(fs, opts)
-}
diff --git a/pkg/sentry/fsimpl/devpts/terminal.go b/pkg/sentry/fsimpl/devpts/terminal.go
index 7d2781c54..510bd6d89 100644
--- a/pkg/sentry/fsimpl/devpts/terminal.go
+++ b/pkg/sentry/fsimpl/devpts/terminal.go
@@ -17,9 +17,9 @@ package devpts
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/usermem"
)
// Terminal is a pseudoterminal.
@@ -36,25 +36,25 @@ type Terminal struct {
// this terminal. This field is immutable.
masterKTTY *kernel.TTY
- // slaveKTTY contains the controlling process of the slave end of this
+ // replicaKTTY contains the controlling process of the replica end of this
// terminal. This field is immutable.
- slaveKTTY *kernel.TTY
+ replicaKTTY *kernel.TTY
}
func newTerminal(n uint32) *Terminal {
- termios := linux.DefaultSlaveTermios
+ termios := linux.DefaultReplicaTermios
t := Terminal{
- n: n,
- ld: newLineDiscipline(termios),
- masterKTTY: &kernel.TTY{Index: n},
- slaveKTTY: &kernel.TTY{Index: n},
+ n: n,
+ ld: newLineDiscipline(termios),
+ masterKTTY: &kernel.TTY{Index: n},
+ replicaKTTY: &kernel.TTY{Index: n},
}
return &t
}
// setControllingTTY makes tm the controlling terminal of the calling thread
// group.
-func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+func (tm *Terminal) setControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error {
task := kernel.TaskFromContext(ctx)
if task == nil {
panic("setControllingTTY must be called from a task context")
@@ -65,7 +65,7 @@ func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args a
// releaseControllingTTY removes tm as the controlling terminal of the calling
// thread group.
-func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+func (tm *Terminal) releaseControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error {
task := kernel.TaskFromContext(ctx)
if task == nil {
panic("releaseControllingTTY must be called from a task context")
@@ -75,7 +75,7 @@ func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, ar
}
// foregroundProcessGroup gets the process group ID of tm's foreground process.
-func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+func (tm *Terminal) foregroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
task := kernel.TaskFromContext(ctx)
if task == nil {
panic("foregroundProcessGroup must be called from a task context")
@@ -87,24 +87,21 @@ func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, a
}
// Write it out to *arg.
- _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ retP := primitive.Int32(ret)
+ _, err = retP.CopyOut(task, args[2].Pointer())
return 0, err
}
// setForegroundProcessGroup sets tm's foreground process group.
-func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
task := kernel.TaskFromContext(ctx)
if task == nil {
panic("setForegroundProcessGroup must be called from a task context")
}
// Read in the process group ID.
- var pgid int32
- if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ var pgid primitive.Int32
+ if _, err := pgid.CopyIn(task, args[2].Pointer()); err != nil {
return 0, err
}
@@ -116,5 +113,5 @@ func (tm *Terminal) tty(isMaster bool) *kernel.TTY {
if isMaster {
return tm.masterKTTY
}
- return tm.slaveKTTY
+ return tm.replicaKTTY
}
diff --git a/pkg/sentry/fsimpl/devtmpfs/BUILD b/pkg/sentry/fsimpl/devtmpfs/BUILD
index aa0c2ad8c..e49a04c1b 100644
--- a/pkg/sentry/fsimpl/devtmpfs/BUILD
+++ b/pkg/sentry/fsimpl/devtmpfs/BUILD
@@ -4,7 +4,10 @@ licenses(["notice"])
go_library(
name = "devtmpfs",
- srcs = ["devtmpfs.go"],
+ srcs = [
+ "devtmpfs.go",
+ "save_restore.go",
+ ],
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
@@ -24,6 +27,7 @@ go_test(
library = ":devtmpfs",
deps = [
"//pkg/abi/linux",
+ "//pkg/context",
"//pkg/fspath",
"//pkg/sentry/contexttest",
"//pkg/sentry/fsimpl/tmpfs",
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
index 142ee53b0..e6fe0fc0d 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
@@ -18,6 +18,7 @@ package devtmpfs
import (
"fmt"
+ "path"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
@@ -32,8 +33,10 @@ import (
const Name = "devtmpfs"
// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type FilesystemType struct {
- initOnce sync.Once
+ initOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1664): not yet supported.
initErr error
// fs is the tmpfs filesystem that backs all mounts of this FilesystemType.
@@ -68,6 +71,15 @@ func (fst *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virtua
return fst.fs, fst.root, nil
}
+// Release implements vfs.FilesystemType.Release.
+func (fst *FilesystemType) Release(ctx context.Context) {
+ if fst.fs != nil {
+ // Release the original reference obtained when creating the filesystem.
+ fst.root.DecRef(ctx)
+ fst.fs.DecRef(ctx)
+ }
+}
+
// Accessor allows devices to create device special files in devtmpfs.
type Accessor struct {
vfsObj *vfs.VirtualFilesystem
@@ -79,22 +91,25 @@ type Accessor struct {
// NewAccessor returns an Accessor that supports creation of device special
// files in the devtmpfs instance registered with name fsTypeName in vfsObj.
func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, fsTypeName string) (*Accessor, error) {
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.GetFilesystemOptions{})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.MountOptions{})
if err != nil {
return nil, err
}
+ // Pass a reference on root to the Accessor.
+ root := mntns.Root()
+ root.IncRef()
return &Accessor{
vfsObj: vfsObj,
mntns: mntns,
- root: mntns.Root(),
+ root: root,
creds: creds,
}, nil
}
// Release must be called when a is no longer in use.
-func (a *Accessor) Release() {
- a.root.DecRef()
- a.mntns.DecRef()
+func (a *Accessor) Release(ctx context.Context) {
+ a.root.DecRef(ctx)
+ a.mntns.DecRef(ctx)
}
// accessorContext implements context.Context by extending an existing
@@ -136,6 +151,8 @@ func (a *Accessor) pathOperationAt(pathname string) *vfs.PathOperation {
// CreateDeviceFile creates a device special file at the given pathname in the
// devtmpfs instance accessed by the Accessor.
func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind vfs.DeviceKind, major, minor uint32, perms uint16) error {
+ actx := a.wrapContext(ctx)
+
mode := (linux.FileMode)(perms)
switch kind {
case vfs.BlockDevice:
@@ -145,12 +162,22 @@ func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind v
default:
panic(fmt.Sprintf("invalid vfs.DeviceKind: %v", kind))
}
+
+ // Create any parent directories. See
+ // devtmpfs.c:handle_create()=>path_create().
+ parent := path.Dir(pathname)
+ if err := a.vfsObj.MkdirAllAt(ctx, parent, a.root, a.creds, &vfs.MkdirOptions{
+ Mode: 0755,
+ }); err != nil {
+ return fmt.Errorf("failed to create device parent directory %q: %v", parent, err)
+ }
+
// NOTE: Linux's devtmpfs refuses to automatically delete files it didn't
// create, which it recognizes by storing a pointer to the kdevtmpfs struct
// thread in struct inode::i_private. Accessor doesn't yet support deletion
// of files at all, and probably won't as long as we don't need to support
// kernel modules, so this is moot for now.
- return a.vfsObj.MknodAt(a.wrapContext(ctx), a.creds, a.pathOperationAt(pathname), &vfs.MknodOptions{
+ return a.vfsObj.MknodAt(actx, a.creds, a.pathOperationAt(pathname), &vfs.MknodOptions{
Mode: mode,
DevMajor: major,
DevMinor: minor,
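
CreateDeviceFile above gains parent-directory creation via MkdirAllAt, mirroring Linux's devtmpfs.c:handle_create()=>path_create(). A sketch of the same derive-parent-then-create flow against the host filesystem, with a plain file standing in for the mknod call (createDeviceFile here is a hypothetical analogue of the Accessor method, not the method itself):

package main

import (
	"fmt"
	"os"
	"path"
	"path/filepath"
)

// createDeviceFile derives the parent with path.Dir, creates it and any
// ancestors with mode 0755, then creates the node itself.
func createDeviceFile(root, pathname string, perms os.FileMode) error {
	parent := path.Dir(pathname)
	if err := os.MkdirAll(filepath.Join(root, parent), 0755); err != nil {
		return fmt.Errorf("failed to create device parent directory %q: %v", parent, err)
	}
	// A regular file stands in for the device node; mknod needs privileges.
	return os.WriteFile(filepath.Join(root, pathname), nil, perms)
}

func main() {
	root, _ := os.MkdirTemp("", "dev")
	defer os.RemoveAll(root)
	if err := createDeviceFile(root, "a/b/c/d/e", 0600); err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println("created", filepath.Join(root, "a/b/c/d/e"))
}
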
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
index b6d52c015..e058eda7a 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
@@ -15,9 +15,11 @@
package devtmpfs
import (
+ "path"
"testing"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/contexttest"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
@@ -25,12 +27,15 @@ import (
"gvisor.dev/gvisor/pkg/sentry/vfs"
)
-func TestDevtmpfs(t *testing.T) {
+const devPath = "/dev"
+
+func setupDevtmpfs(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesystem, vfs.VirtualDentry, func()) {
+ t.Helper()
+
ctx := contexttest.Context(t)
creds := auth.CredentialsFromContext(ctx)
-
vfsObj := &vfs.VirtualFilesystem{}
- if err := vfsObj.Init(); err != nil {
+ if err := vfsObj.Init(ctx); err != nil {
t.Fatalf("VFS init: %v", err)
}
// Register tmpfs just so that we can have a root filesystem that isn't
@@ -43,14 +48,12 @@ func TestDevtmpfs(t *testing.T) {
})
// Create a test mount namespace with devtmpfs mounted at "/dev".
- const devPath = "/dev"
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.GetFilesystemOptions{})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.MountOptions{})
if err != nil {
t.Fatalf("failed to create tmpfs root mount: %v", err)
}
- defer mntns.DecRef()
root := mntns.Root()
- defer root.DecRef()
+ root.IncRef()
devpop := vfs.PathOperation{
Root: root,
Start: root,
@@ -61,62 +64,167 @@ func TestDevtmpfs(t *testing.T) {
}); err != nil {
t.Fatalf("failed to create mount point: %v", err)
}
- if err := vfsObj.MountAt(ctx, creds, "devtmpfs" /* source */, &devpop, "devtmpfs" /* fsTypeName */, &vfs.MountOptions{}); err != nil {
+ if _, err := vfsObj.MountAt(ctx, creds, "devtmpfs" /* source */, &devpop, "devtmpfs" /* fsTypeName */, &vfs.MountOptions{}); err != nil {
t.Fatalf("failed to mount devtmpfs: %v", err)
}
+ return ctx, creds, vfsObj, root, func() {
+ root.DecRef(ctx)
+ mntns.DecRef(ctx)
+ }
+}
+
+func TestUserspaceInit(t *testing.T) {
+ ctx, creds, vfsObj, root, cleanup := setupDevtmpfs(t)
+ defer cleanup()
+
a, err := NewAccessor(ctx, vfsObj, creds, "devtmpfs")
if err != nil {
t.Fatalf("failed to create devtmpfs.Accessor: %v", err)
}
- defer a.Release()
+ defer a.Release(ctx)
// Create "userspace-initialized" files using a devtmpfs.Accessor.
if err := a.UserspaceInit(ctx); err != nil {
t.Fatalf("failed to userspace-initialize devtmpfs: %v", err)
}
+
// Created files should be visible in the test mount namespace.
- abspath := devPath + "/fd"
- target, err := vfsObj.ReadlinkAt(ctx, creds, &vfs.PathOperation{
- Root: root,
- Start: root,
- Path: fspath.Parse(abspath),
- })
- if want := "/proc/self/fd"; err != nil || target != want {
- t.Fatalf("readlink(%q): got (%q, %v), wanted (%q, nil)", abspath, target, err, want)
+ links := []struct {
+ source string
+ target string
+ }{
+ {
+ source: "fd",
+ target: "/proc/self/fd",
+ },
+ {
+ source: "stdin",
+ target: "/proc/self/fd/0",
+ },
+ {
+ source: "stdout",
+ target: "/proc/self/fd/1",
+ },
+ {
+ source: "stderr",
+ target: "/proc/self/fd/2",
+ },
+ {
+ source: "ptmx",
+ target: "pts/ptmx",
+ },
}
- // Create a dummy device special file using a devtmpfs.Accessor.
- const (
- pathInDev = "dummy"
- kind = vfs.CharDevice
- major = 12
- minor = 34
- perms = 0600
- wantMode = linux.S_IFCHR | perms
- )
- if err := a.CreateDeviceFile(ctx, pathInDev, kind, major, minor, perms); err != nil {
- t.Fatalf("failed to create device file: %v", err)
+ for _, link := range links {
+ abspath := path.Join(devPath, link.source)
+ if gotTarget, err := vfsObj.ReadlinkAt(ctx, creds, &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(abspath),
+ }); err != nil || gotTarget != link.target {
+ t.Errorf("readlink(%q): got (%q, %v), wanted (%q, nil)", abspath, gotTarget, err, link.target)
+ }
}
- // The device special file should be visible in the test mount namespace.
- abspath = devPath + "/" + pathInDev
- stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
- Root: root,
- Start: root,
- Path: fspath.Parse(abspath),
- }, &vfs.StatOptions{
- Mask: linux.STATX_TYPE | linux.STATX_MODE,
- })
- if err != nil {
- t.Fatalf("failed to stat device file at %q: %v", abspath, err)
+
+ dirs := []string{"shm", "pts"}
+ for _, dir := range dirs {
+ abspath := path.Join(devPath, dir)
+ statx, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(abspath),
+ }, &vfs.StatOptions{
+ Mask: linux.STATX_MODE,
+ })
+ if err != nil {
+ t.Errorf("stat(%q): got error %v ", abspath, err)
+ continue
+ }
+ if want := uint16(0755) | linux.S_IFDIR; statx.Mode != want {
+ t.Errorf("stat(%q): got mode %x, want %x", abspath, statx.Mode, want)
+ }
}
- if stat.Mode != wantMode {
- t.Errorf("device file mode: got %v, wanted %v", stat.Mode, wantMode)
+}
+
+func TestCreateDeviceFile(t *testing.T) {
+ ctx, creds, vfsObj, root, cleanup := setupDevtmpfs(t)
+ defer cleanup()
+
+ a, err := NewAccessor(ctx, vfsObj, creds, "devtmpfs")
+ if err != nil {
+ t.Fatalf("failed to create devtmpfs.Accessor: %v", err)
}
- if stat.RdevMajor != major {
- t.Errorf("major device number: got %v, wanted %v", stat.RdevMajor, major)
+ defer a.Release(ctx)
+
+ devFiles := []struct {
+ path string
+ kind vfs.DeviceKind
+ major uint32
+ minor uint32
+ perms uint16
+ }{
+ {
+ path: "dummy",
+ kind: vfs.CharDevice,
+ major: 12,
+ minor: 34,
+ perms: 0600,
+ },
+ {
+ path: "foo/bar",
+ kind: vfs.BlockDevice,
+ major: 13,
+ minor: 35,
+ perms: 0660,
+ },
+ {
+ path: "foo/baz",
+ kind: vfs.CharDevice,
+ major: 12,
+ minor: 40,
+ perms: 0666,
+ },
+ {
+ path: "a/b/c/d/e",
+ kind: vfs.BlockDevice,
+ major: 12,
+ minor: 34,
+ perms: 0600,
+ },
}
- if stat.RdevMinor != minor {
- t.Errorf("minor device number: got %v, wanted %v", stat.RdevMinor, minor)
+
+ for _, f := range devFiles {
+ if err := a.CreateDeviceFile(ctx, f.path, f.kind, f.major, f.minor, f.perms); err != nil {
+ t.Fatalf("failed to create device file: %v", err)
+ }
+ // The device special file should be visible in the test mount namespace.
+ abspath := path.Join(devPath, f.path)
+ stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(abspath),
+ }, &vfs.StatOptions{
+ Mask: linux.STATX_TYPE | linux.STATX_MODE,
+ })
+ if err != nil {
+ t.Fatalf("failed to stat device file at %q: %v", abspath, err)
+ }
+ if stat.RdevMajor != f.major {
+ t.Errorf("major device number: got %v, wanted %v", stat.RdevMajor, f.major)
+ }
+ if stat.RdevMinor != f.minor {
+ t.Errorf("minor device number: got %v, wanted %v", stat.RdevMinor, f.minor)
+ }
+ wantMode := f.perms
+ switch f.kind {
+ case vfs.CharDevice:
+ wantMode |= linux.S_IFCHR
+ case vfs.BlockDevice:
+ wantMode |= linux.S_IFBLK
+ }
+ if stat.Mode != wantMode {
+ t.Errorf("device file mode: got %v, wanted %v", stat.Mode, wantMode)
+ }
}
}
diff --git a/pkg/sentry/fsimpl/devtmpfs/save_restore.go b/pkg/sentry/fsimpl/devtmpfs/save_restore.go
new file mode 100644
index 000000000..28832d850
--- /dev/null
+++ b/pkg/sentry/fsimpl/devtmpfs/save_restore.go
@@ -0,0 +1,23 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devtmpfs
+
+// afterLoad is invoked by stateify.
+func (fst *FilesystemType) afterLoad() {
+ if fst.fs != nil {
+ // Ensure that we don't create another filesystem.
+ fst.initOnce.Do(func() {})
+ }
+}
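
The no-op Do above works because a sync.Once runs its function at most once per value: spending the Once during afterLoad means a later initOnce.Do cannot re-create the filesystem. A self-contained sketch of the idiom (standard library only; the lazyFS type is illustrative, not the devtmpfs one):

package main

import (
    "fmt"
    "sync"
)

type lazyFS struct {
    initOnce sync.Once
    fs       string
}

func (l *lazyFS) get() string {
    l.initOnce.Do(func() { l.fs = "freshly initialized" })
    return l.fs
}

func main() {
    l := &lazyFS{fs: "restored from checkpoint"}
    // Consume the Once with a no-op, as afterLoad does, so get() will not
    // clobber the restored state.
    l.initOnce.Do(func() {})
    fmt.Println(l.get()) // prints "restored from checkpoint"
}
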
diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go
index c573d7935..5b29f2358 100644
--- a/pkg/sentry/fsimpl/eventfd/eventfd.go
+++ b/pkg/sentry/fsimpl/eventfd/eventfd.go
@@ -30,17 +30,20 @@ import (
"gvisor.dev/gvisor/pkg/waiter"
)
-// EventFileDescription implements FileDescriptionImpl for file-based event
+// EventFileDescription implements vfs.FileDescriptionImpl for file-based event
// notification (eventfd). Eventfds are usually internal to the Sentry but in
// certain situations they may be converted into a host-backed eventfd.
+//
+// +stateify savable
type EventFileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
+ vfs.NoLockFD
// queue is used to notify interested parties when the event object
// becomes readable or writable.
- queue waiter.Queue `state:"zerovalue"`
+ queue waiter.Queue
// mu protects the fields below.
mu sync.Mutex `state:"nosave"`
@@ -58,9 +61,9 @@ type EventFileDescription struct {
var _ vfs.FileDescriptionImpl = (*EventFileDescription)(nil)
// New creates a new event fd.
-func New(vfsObj *vfs.VirtualFilesystem, initVal uint64, semMode bool, flags uint32) (*vfs.FileDescription, error) {
+func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, initVal uint64, semMode bool, flags uint32) (*vfs.FileDescription, error) {
vd := vfsObj.NewAnonVirtualDentry("[eventfd]")
- defer vd.DecRef()
+ defer vd.DecRef(ctx)
efd := &EventFileDescription{
val: initVal,
semMode: semMode,
@@ -105,8 +108,8 @@ func (efd *EventFileDescription) HostFD() (int, error) {
return efd.hostfd, nil
}
-// Release implements FileDescriptionImpl.Release()
-func (efd *EventFileDescription) Release() {
+// Release implements vfs.FileDescriptionImpl.Release.
+func (efd *EventFileDescription) Release(context.Context) {
efd.mu.Lock()
defer efd.mu.Unlock()
if efd.hostfd >= 0 {
@@ -118,7 +121,7 @@ func (efd *EventFileDescription) Release() {
}
}
-// Read implements FileDescriptionImpl.Read.
+// Read implements vfs.FileDescriptionImpl.Read.
func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
if dst.NumBytes() < 8 {
return 0, syscall.EINVAL
@@ -129,7 +132,7 @@ func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequenc
return 8, nil
}
-// Write implements FileDescriptionImpl.Write.
+// Write implements vfs.FileDescriptionImpl.Write.
func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
if src.NumBytes() < 8 {
return 0, syscall.EINVAL
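
The guards above enforce the eventfd contract: every transfer is exactly the 8 bytes of a uint64 counter, and anything smaller fails with EINVAL. A standalone sketch of that framing (the little-endian choice here is an assumption for illustration; the Sentry encodes through its usermem layer):

package main

import (
    "encoding/binary"
    "errors"
    "fmt"
)

var errEINVAL = errors.New("EINVAL: buffer shorter than 8 bytes")

// readCounter mimics the eventfd contract: the destination must hold 8 bytes.
func readCounter(val uint64, dst []byte) (int, error) {
    if len(dst) < 8 {
        return 0, errEINVAL
    }
    binary.LittleEndian.PutUint64(dst, val) // assumption: little-endian encoding
    return 8, nil
}

func main() {
    buf := make([]byte, 8)
    if n, err := readCounter(42, buf); err == nil {
        fmt.Printf("read %d bytes: %v\n", n, buf)
    }
    if _, err := readCounter(42, make([]byte, 4)); err != nil {
        fmt.Println(err) // too small, mirrors the dst.NumBytes() < 8 check
    }
}
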
diff --git a/pkg/sentry/fsimpl/eventfd/eventfd_test.go b/pkg/sentry/fsimpl/eventfd/eventfd_test.go
index 20e3adffc..49916fa81 100644
--- a/pkg/sentry/fsimpl/eventfd/eventfd_test.go
+++ b/pkg/sentry/fsimpl/eventfd/eventfd_test.go
@@ -36,16 +36,16 @@ func TestEventFD(t *testing.T) {
for _, initVal := range initVals {
ctx := contexttest.Context(t)
vfsObj := &vfs.VirtualFilesystem{}
- if err := vfsObj.Init(); err != nil {
+ if err := vfsObj.Init(ctx); err != nil {
t.Fatalf("VFS init: %v", err)
}
// Make a new eventfd that is writable.
- eventfd, err := New(vfsObj, initVal, false, linux.O_RDWR)
+ eventfd, err := New(ctx, vfsObj, initVal, false, linux.O_RDWR)
if err != nil {
t.Fatalf("New() failed: %v", err)
}
- defer eventfd.DecRef()
+ defer eventfd.DecRef(ctx)
// Register a callback for a write event.
w, ch := waiter.NewChannelEntry(nil)
@@ -74,16 +74,16 @@ func TestEventFD(t *testing.T) {
func TestEventFDStat(t *testing.T) {
ctx := contexttest.Context(t)
vfsObj := &vfs.VirtualFilesystem{}
- if err := vfsObj.Init(); err != nil {
+ if err := vfsObj.Init(ctx); err != nil {
t.Fatalf("VFS init: %v", err)
}
// Make a new eventfd that is writable.
- eventfd, err := New(vfsObj, 0, false, linux.O_RDWR)
+ eventfd, err := New(ctx, vfsObj, 0, false, linux.O_RDWR)
if err != nil {
t.Fatalf("New() failed: %v", err)
}
- defer eventfd.DecRef()
+ defer eventfd.DecRef(ctx)
statx, err := eventfd.Stat(ctx, vfs.StatOptions{
Mask: linux.STATX_BASIC_STATS,
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index ff861d0fe..7b1eec3da 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -51,9 +51,12 @@ go_library(
"//pkg/fd",
"//pkg/fspath",
"//pkg/log",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
"//pkg/safemem",
"//pkg/sentry/arch",
"//pkg/sentry/fs",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/fsimpl/ext/disklayout",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/memmap",
@@ -85,9 +88,9 @@ go_test(
library = ":ext",
deps = [
"//pkg/abi/linux",
- "//pkg/binary",
"//pkg/context",
"//pkg/fspath",
+ "//pkg/marshal/primitive",
"//pkg/sentry/contexttest",
"//pkg/sentry/fsimpl/ext/disklayout",
"//pkg/sentry/kernel/auth",
@@ -95,7 +98,7 @@ go_test(
"//pkg/syserror",
"//pkg/test/testutil",
"//pkg/usermem",
- "@com_github_google_go-cmp//cmp:go_default_library",
- "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
+ "@com_github_google_go_cmp//cmp:go_default_library",
+ "@com_github_google_go_cmp//cmp/cmpopts:go_default_library",
],
)
diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index 89caee3df..2ee7cc7ac 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -53,22 +53,27 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys
// Create VFS.
vfsObj := &vfs.VirtualFilesystem{}
- if err := vfsObj.Init(); err != nil {
+ if err := vfsObj.Init(ctx); err != nil {
return nil, nil, nil, nil, err
}
vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.MountOptions{
+ GetFilesystemOptions: vfs.GetFilesystemOptions{
+ InternalData: int(f.Fd()),
+ },
+ })
if err != nil {
f.Close()
return nil, nil, nil, nil, err
}
root := mntns.Root()
+ root.IncRef()
tearDown := func() {
- root.DecRef()
+ root.DecRef(ctx)
if err := f.Close(); err != nil {
b.Fatalf("tearDown failed: %v", err)
@@ -90,7 +95,7 @@ func mount(b *testing.B, imagePath string, vfsfs *vfs.VirtualFilesystem, pop *vf
ctx := contexttest.Context(b)
creds := auth.CredentialsFromContext(ctx)
- if err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{
+ if _, err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{
GetFilesystemOptions: vfs.GetFilesystemOptions{
InternalData: int(f.Fd()),
},
@@ -169,7 +174,7 @@ func BenchmarkVFS2ExtfsMountStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to walk to mount point: %v", err)
}
- defer mountPoint.DecRef()
+ defer mountPoint.DecRef(ctx)
// Create extfs submount.
mountTearDown := mount(b, fmt.Sprintf("/tmp/image-%d.ext4", depth), vfsfs, &pop)
diff --git a/pkg/sentry/fsimpl/ext/block_map_file.go b/pkg/sentry/fsimpl/ext/block_map_file.go
index a2d8c3ad6..1165234f9 100644
--- a/pkg/sentry/fsimpl/ext/block_map_file.go
+++ b/pkg/sentry/fsimpl/ext/block_map_file.go
@@ -18,7 +18,7 @@ import (
"io"
"math"
- "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -34,19 +34,19 @@ type blockMapFile struct {
// directBlks are the direct blocks numbers. The physical blocks pointed by
// these holds file data. Contains file blocks 0 to 11.
- directBlks [numDirectBlks]uint32
+ directBlks [numDirectBlks]primitive.Uint32
// indirectBlk is the physical block which contains (blkSize/4) direct block
// numbers (as uint32 integers).
- indirectBlk uint32
+ indirectBlk primitive.Uint32
// doubleIndirectBlk is the physical block which contains (blkSize/4) indirect
// block numbers (as uint32 integers).
- doubleIndirectBlk uint32
+ doubleIndirectBlk primitive.Uint32
// tripleIndirectBlk is the physical block which contains (blkSize/4) doubly
// indirect block numbers (as uint32 integers).
- tripleIndirectBlk uint32
+ tripleIndirectBlk primitive.Uint32
// coverage at (i)th index indicates the amount of file data a node at
// height (i) covers. Height 0 is the direct block.
@@ -58,19 +58,22 @@ var _ io.ReaderAt = (*blockMapFile)(nil)
// newBlockMapFile is the blockMapFile constructor. It initializes the file to
// physical blocks map with (at most) the first 12 (direct) blocks.
-func newBlockMapFile(regFile regularFile) (*blockMapFile, error) {
- file := &blockMapFile{regFile: regFile}
+func newBlockMapFile(args inodeArgs) (*blockMapFile, error) {
+ file := &blockMapFile{}
file.regFile.impl = file
+ file.regFile.inode.init(args, &file.regFile)
for i := uint(0); i < 4; i++ {
- file.coverage[i] = getCoverage(regFile.inode.blkSize, i)
+ file.coverage[i] = getCoverage(file.regFile.inode.blkSize, i)
}
- blkMap := regFile.inode.diskInode.Data()
- binary.Unmarshal(blkMap[:numDirectBlks*4], binary.LittleEndian, &file.directBlks)
- binary.Unmarshal(blkMap[numDirectBlks*4:(numDirectBlks+1)*4], binary.LittleEndian, &file.indirectBlk)
- binary.Unmarshal(blkMap[(numDirectBlks+1)*4:(numDirectBlks+2)*4], binary.LittleEndian, &file.doubleIndirectBlk)
- binary.Unmarshal(blkMap[(numDirectBlks+2)*4:(numDirectBlks+3)*4], binary.LittleEndian, &file.tripleIndirectBlk)
+ blkMap := file.regFile.inode.diskInode.Data()
+ for i := 0; i < numDirectBlks; i++ {
+ file.directBlks[i].UnmarshalBytes(blkMap[i*4 : (i+1)*4])
+ }
+ file.indirectBlk.UnmarshalBytes(blkMap[numDirectBlks*4 : (numDirectBlks+1)*4])
+ file.doubleIndirectBlk.UnmarshalBytes(blkMap[(numDirectBlks+1)*4 : (numDirectBlks+2)*4])
+ file.tripleIndirectBlk.UnmarshalBytes(blkMap[(numDirectBlks+2)*4 : (numDirectBlks+3)*4])
return file, nil
}
@@ -116,16 +119,16 @@ func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) {
switch {
case offset < dirBlksEnd:
// Direct block.
- curR, err = f.read(f.directBlks[offset/f.regFile.inode.blkSize], offset%f.regFile.inode.blkSize, 0, dst[read:])
+ curR, err = f.read(uint32(f.directBlks[offset/f.regFile.inode.blkSize]), offset%f.regFile.inode.blkSize, 0, dst[read:])
case offset < indirBlkEnd:
// Indirect block.
- curR, err = f.read(f.indirectBlk, offset-dirBlksEnd, 1, dst[read:])
+ curR, err = f.read(uint32(f.indirectBlk), offset-dirBlksEnd, 1, dst[read:])
case offset < doubIndirBlkEnd:
// Doubly indirect block.
- curR, err = f.read(f.doubleIndirectBlk, offset-indirBlkEnd, 2, dst[read:])
+ curR, err = f.read(uint32(f.doubleIndirectBlk), offset-indirBlkEnd, 2, dst[read:])
default:
// Triply indirect block.
- curR, err = f.read(f.tripleIndirectBlk, offset-doubIndirBlkEnd, 3, dst[read:])
+ curR, err = f.read(uint32(f.tripleIndirectBlk), offset-doubIndirBlkEnd, 3, dst[read:])
}
read += curR
@@ -173,13 +176,13 @@ func (f *blockMapFile) read(curPhyBlk uint32, relFileOff uint64, height uint, ds
read := 0
curChildOff := relFileOff % childCov
for i := startIdx; i < endIdx; i++ {
- var childPhyBlk uint32
+ var childPhyBlk primitive.Uint32
err := readFromDisk(f.regFile.inode.fs.dev, curPhyBlkOff+int64(i*4), &childPhyBlk)
if err != nil {
return read, err
}
- n, err := f.read(childPhyBlk, curChildOff, height-1, dst[read:])
+ n, err := f.read(uint32(childPhyBlk), curChildOff, height-1, dst[read:])
read += n
if err != nil {
return read, err
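
The rewrite above trades reflection-driven binary.Unmarshal for the generated UnmarshalBytes on primitive.Uint32, keeping the same fixed little-endian layout: 12 direct block numbers followed by the single, double, and triple indirect block numbers. A self-contained stand-in for that decode using encoding/binary directly:

package main

import (
    "encoding/binary"
    "fmt"
)

const numDirectBlks = 12

type blockMap struct {
    direct                                   [numDirectBlks]uint32
    indirect, doubleIndirect, tripleIndirect uint32
}

// decodeBlockMap decodes the 60-byte on-disk block map (little-endian),
// matching the slicing arithmetic in newBlockMapFile above.
func decodeBlockMap(blkMap []byte) blockMap {
    var m blockMap
    for i := 0; i < numDirectBlks; i++ {
        m.direct[i] = binary.LittleEndian.Uint32(blkMap[i*4 : (i+1)*4])
    }
    m.indirect = binary.LittleEndian.Uint32(blkMap[numDirectBlks*4:])
    m.doubleIndirect = binary.LittleEndian.Uint32(blkMap[(numDirectBlks+1)*4:])
    m.tripleIndirect = binary.LittleEndian.Uint32(blkMap[(numDirectBlks+2)*4:])
    return m
}

func main() {
    raw := make([]byte, (numDirectBlks+3)*4)
    binary.LittleEndian.PutUint32(raw[0:], 7)                 // direct block 0
    binary.LittleEndian.PutUint32(raw[numDirectBlks*4:], 99)  // indirect block
    m := decodeBlockMap(raw)
    fmt.Println(m.direct[0], m.indirect) // 7 99
}
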
diff --git a/pkg/sentry/fsimpl/ext/block_map_test.go b/pkg/sentry/fsimpl/ext/block_map_test.go
index 181727ef7..ed98b482e 100644
--- a/pkg/sentry/fsimpl/ext/block_map_test.go
+++ b/pkg/sentry/fsimpl/ext/block_map_test.go
@@ -20,7 +20,7 @@ import (
"testing"
"github.com/google/go-cmp/cmp"
- "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
)
@@ -85,49 +85,50 @@ func (n *blkNumGen) next() uint32 {
// the inode covers and that is written to disk.
func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) {
mockDisk := make([]byte, mockBMDiskSize)
- regFile := regularFile{
- inode: inode{
- fs: &filesystem{
- dev: bytes.NewReader(mockDisk),
- },
- diskInode: &disklayout.InodeNew{
- InodeOld: disklayout.InodeOld{
- SizeLo: getMockBMFileFize(),
- },
- },
- blkSize: uint64(mockBMBlkSize),
- },
- }
-
var fileData []byte
blkNums := newBlkNumGen()
- var data []byte
+ off := 0
+ data := make([]byte, (numDirectBlks+3)*(*primitive.Uint32)(nil).SizeBytes())
// Write the direct blocks.
for i := 0; i < numDirectBlks; i++ {
- curBlkNum := blkNums.next()
- data = binary.Marshal(data, binary.LittleEndian, curBlkNum)
- fileData = append(fileData, writeFileDataToBlock(mockDisk, curBlkNum, 0, blkNums)...)
+ curBlkNum := primitive.Uint32(blkNums.next())
+ curBlkNum.MarshalBytes(data[off:])
+ off += curBlkNum.SizeBytes()
+ fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(curBlkNum), 0, blkNums)...)
}
// Write to indirect block.
- indirectBlk := blkNums.next()
- data = binary.Marshal(data, binary.LittleEndian, indirectBlk)
- fileData = append(fileData, writeFileDataToBlock(mockDisk, indirectBlk, 1, blkNums)...)
-
- // Write to indirect block.
- doublyIndirectBlk := blkNums.next()
- data = binary.Marshal(data, binary.LittleEndian, doublyIndirectBlk)
- fileData = append(fileData, writeFileDataToBlock(mockDisk, doublyIndirectBlk, 2, blkNums)...)
-
- // Write to indirect block.
- triplyIndirectBlk := blkNums.next()
- data = binary.Marshal(data, binary.LittleEndian, triplyIndirectBlk)
- fileData = append(fileData, writeFileDataToBlock(mockDisk, triplyIndirectBlk, 3, blkNums)...)
-
- copy(regFile.inode.diskInode.Data(), data)
+ indirectBlk := primitive.Uint32(blkNums.next())
+ indirectBlk.MarshalBytes(data[off:])
+ off += indirectBlk.SizeBytes()
+ fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(indirectBlk), 1, blkNums)...)
+
+ // Write to double indirect block.
+ doublyIndirectBlk := primitive.Uint32(blkNums.next())
+ doublyIndirectBlk.MarshalBytes(data[off:])
+ off += doublyIndirectBlk.SizeBytes()
+ fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(doublyIndirectBlk), 2, blkNums)...)
+
+ // Write to triple indirect block.
+ triplyIndirectBlk := primitive.Uint32(blkNums.next())
+ triplyIndirectBlk.MarshalBytes(data[off:])
+ fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(triplyIndirectBlk), 3, blkNums)...)
+
+ args := inodeArgs{
+ fs: &filesystem{
+ dev: bytes.NewReader(mockDisk),
+ },
+ diskInode: &disklayout.InodeNew{
+ InodeOld: disklayout.InodeOld{
+ SizeLo: getMockBMFileFize(),
+ },
+ },
+ blkSize: uint64(mockBMBlkSize),
+ }
+ copy(args.diskInode.Data(), data)
- mockFile, err := newBlockMapFile(regFile)
+ mockFile, err := newBlockMapFile(args)
if err != nil {
t.Fatalf("newBlockMapFile failed: %v", err)
}
@@ -145,9 +146,9 @@ func writeFileDataToBlock(disk []byte, blkNum uint32, height uint, blkNums *blkN
var fileData []byte
for off := blkNum * mockBMBlkSize; off < (blkNum+1)*mockBMBlkSize; off += 4 {
- curBlkNum := blkNums.next()
- copy(disk[off:off+4], binary.Marshal(nil, binary.LittleEndian, curBlkNum))
- fileData = append(fileData, writeFileDataToBlock(disk, curBlkNum, height-1, blkNums)...)
+ curBlkNum := primitive.Uint32(blkNums.next())
+ curBlkNum.MarshalBytes(disk[off : off+4])
+ fileData = append(fileData, writeFileDataToBlock(disk, uint32(curBlkNum), height-1, blkNums)...)
}
return fileData
}
diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go
index 6bd1a9fc6..9bfed883a 100644
--- a/pkg/sentry/fsimpl/ext/dentry.go
+++ b/pkg/sentry/fsimpl/ext/dentry.go
@@ -15,10 +15,13 @@
package ext
import (
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/vfs"
)
// dentry implements vfs.DentryImpl.
+//
+// +stateify savable
type dentry struct {
vfsd vfs.Dentry
@@ -55,7 +58,7 @@ func (d *dentry) TryIncRef() bool {
}
// DecRef implements vfs.DentryImpl.DecRef.
-func (d *dentry) DecRef() {
+func (d *dentry) DecRef(ctx context.Context) {
// FIXME(b/134676337): filesystem.mu may not be locked as required by
// inode.decRef().
d.inode.decRef()
@@ -63,12 +66,17 @@ func (d *dentry) DecRef() {
// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
//
-// TODO(gvisor.dev/issue/1479): Implement inotify.
-func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {}
+// TODO(b/134676337): Implement inotify.
+func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {}
// Watches implements vfs.DentryImpl.Watches.
//
-// TODO(gvisor.dev/issue/1479): Implement inotify.
+// TODO(b/134676337): Implement inotify.
func (d *dentry) Watches() *vfs.Watches {
return nil
}
+
+// OnZeroWatches implements vfs.Dentry.OnZeroWatches.
+//
+// TODO(b/134676337): Implement inotify.
+func (d *dentry) OnZeroWatches(context.Context) {}
diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go
index 12b875c8f..0ad79b381 100644
--- a/pkg/sentry/fsimpl/ext/directory.go
+++ b/pkg/sentry/fsimpl/ext/directory.go
@@ -16,10 +16,10 @@ package ext
import (
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/fs"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
@@ -27,6 +27,8 @@ import (
)
// directory represents a directory inode. It holds the childList in memory.
+//
+// +stateify savable
type directory struct {
inode inode
@@ -38,7 +40,7 @@ type directory struct {
// Lock Order (outermost locks must be taken first):
// directory.mu
// filesystem.mu
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
// childList is a list containing (1) child dirents and (2) fake dirents
// (with diskDirent == nil) that represent the iteration position of
@@ -54,16 +56,15 @@ type directory struct {
}
// newDirectory is the directory constructor.
-func newDirectory(inode inode, newDirent bool) (*directory, error) {
+func newDirectory(args inodeArgs, newDirent bool) (*directory, error) {
file := &directory{
- inode: inode,
childCache: make(map[string]*dentry),
childMap: make(map[string]*dirent),
}
- file.inode.impl = file
+ file.inode.init(args, file)
// Initialize childList by reading dirents from the underlying file.
- if inode.diskInode.Flags().Index {
+ if args.diskInode.Flags().Index {
// TODO(b/134676337): Support hash tree directories. Currently only the '.'
// and '..' entries are read in.
@@ -74,7 +75,7 @@ func newDirectory(inode inode, newDirent bool) (*directory, error) {
// The dirents are organized in a linear array in the file data.
// Extract the file data and decode the dirents.
- regFile, err := newRegularFile(inode)
+ regFile, err := newRegularFile(args)
if err != nil {
return nil, err
}
@@ -82,7 +83,7 @@ func newDirectory(inode inode, newDirent bool) (*directory, error) {
// buf is used as scratch space for reading in dirents from disk and
// unmarshalling them into dirent structs.
buf := make([]byte, disklayout.DirentSize)
- size := inode.diskInode.Size()
+ size := args.diskInode.Size()
for off, inc := uint64(0), uint64(0); off < size; off += inc {
toRead := size - off
if toRead > disklayout.DirentSize {
@@ -98,7 +99,7 @@ func newDirectory(inode inode, newDirent bool) (*directory, error) {
} else {
curDirent.diskDirent = &disklayout.DirentOld{}
}
- binary.Unmarshal(buf, binary.LittleEndian, curDirent.diskDirent)
+ curDirent.diskDirent.UnmarshalBytes(buf)
if curDirent.diskDirent.Inode() != 0 && len(curDirent.diskDirent.FileName()) != 0 {
// Inode number and name length fields being set to 0 is used to indicate
@@ -120,6 +121,8 @@ func (i *inode) isDir() bool {
}
// dirent is the directory.childList node.
+//
+// +stateify savable
type dirent struct {
diskDirent disklayout.Dirent
@@ -129,6 +132,8 @@ type dirent struct {
// directoryFD represents a directory file description. It implements
// vfs.FileDescriptionImpl.
+//
+// +stateify savable
type directoryFD struct {
fileDescription
vfs.DirectoryFileDescriptionDefaultImpl
@@ -142,7 +147,7 @@ type directoryFD struct {
var _ vfs.FileDescriptionImpl = (*directoryFD)(nil)
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *directoryFD) Release() {
+func (fd *directoryFD) Release(ctx context.Context) {
if fd.iter == nil {
return
}
@@ -306,3 +311,13 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in
fd.off = offset
return offset, nil
}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *directoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *directoryFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
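
newDirectory's loop above is a linear scan of variable-length records: unmarshal a dirent from a fixed-size scratch buffer, skip it if the inode number or name length is zero, and advance by the record's own length. A self-contained sketch of the traversal, using a simplified record layout (4-byte inode, 2-byte record length, name) rather than the real disklayout.Dirent:

package main

import (
    "encoding/binary"
    "fmt"
)

// walkDirents iterates variable-length records the way newDirectory does:
// each record begins with its own length, which determines the increment.
func walkDirents(data []byte) {
    for off := 0; off+6 <= len(data); {
        inode := binary.LittleEndian.Uint32(data[off:])
        recLen := int(binary.LittleEndian.Uint16(data[off+4:]))
        if recLen < 6 || off+recLen > len(data) {
            return // corrupt record; stop, as a real parser would
        }
        name := string(data[off+6 : off+recLen])
        if inode != 0 && len(name) != 0 { // zero inode/name marks an unused slot
            fmt.Printf("inode %d name %q\n", inode, name)
        }
        off += recLen
    }
}

func main() {
    var buf []byte
    for _, e := range []struct {
        ino  uint32
        name string
    }{{2, "."}, {2, ".."}, {11, "lost+found"}} {
        rec := make([]byte, 6+len(e.name))
        binary.LittleEndian.PutUint32(rec, e.ino)
        binary.LittleEndian.PutUint16(rec[4:], uint16(len(rec)))
        copy(rec[6:], e.name)
        buf = append(buf, rec...)
    }
    walkDirents(buf)
}
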
diff --git a/pkg/sentry/fsimpl/ext/disklayout/BUILD b/pkg/sentry/fsimpl/ext/disklayout/BUILD
index 9bd9c76c0..d98a05dd8 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/BUILD
+++ b/pkg/sentry/fsimpl/ext/disklayout/BUILD
@@ -22,10 +22,11 @@ go_library(
"superblock_old.go",
"test_utils.go",
],
+ marshal = True,
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
- "//pkg/binary",
+ "//pkg/marshal",
"//pkg/sentry/fs",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/kernel/time",
diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group.go b/pkg/sentry/fsimpl/ext/disklayout/block_group.go
index ad6f4fef8..0d56ae9da 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/block_group.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/block_group.go
@@ -14,6 +14,10 @@
package disklayout
+import (
+ "gvisor.dev/gvisor/pkg/marshal"
+)
+
// BlockGroup represents a Linux ext block group descriptor. An ext file system
// is split into a series of block groups. This provides an access layer to
// information needed to access and use a block group.
@@ -30,6 +34,8 @@ package disklayout
//
// See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#block-group-descriptors.
type BlockGroup interface {
+ marshal.Marshallable
+
// InodeTable returns the absolute block number of the block containing the
// inode table. This points to an array of Inode structs. Inode tables are
// statically allocated at mkfs time. The superblock records the number of
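
Embedding marshal.Marshallable here (and in the Dirent, Inode, and SuperBlock interfaces below) means callers can pick the 32- or 64-bit concrete struct at runtime and still size and decode it generically. A minimal standard-library analogue of the pattern; the sizedDecoder interface and desc structs are illustrative, not gVisor types:

package main

import (
    "encoding/binary"
    "fmt"
)

// sizedDecoder stands in for marshal.Marshallable's SizeBytes/UnmarshalBytes.
type sizedDecoder interface {
    SizeBytes() int
    UnmarshalBytes(src []byte)
}

type desc32 struct{ BlockBitmapLo uint32 }
type desc64 struct {
    desc32
    BlockBitmapHi uint32
}

func (d *desc32) SizeBytes() int            { return 4 }
func (d *desc32) UnmarshalBytes(src []byte) { d.BlockBitmapLo = binary.LittleEndian.Uint32(src) }
func (d *desc64) SizeBytes() int            { return 8 }
func (d *desc64) UnmarshalBytes(src []byte) {
    d.desc32.UnmarshalBytes(src)
    d.BlockBitmapHi = binary.LittleEndian.Uint32(src[4:])
}

// readDescriptors decodes a packed array without knowing the concrete type,
// mirroring how readBlockGroups can handle 32- and 64-bit block groups.
func readDescriptors(disk []byte, mk func() sizedDecoder) []sizedDecoder {
    var out []sizedDecoder
    for off := 0; off < len(disk); {
        d := mk()
        d.UnmarshalBytes(disk[off:])
        off += d.SizeBytes()
        out = append(out, d)
    }
    return out
}

func main() {
    disk := make([]byte, 16)
    binary.LittleEndian.PutUint32(disk, 1)
    binary.LittleEndian.PutUint32(disk[8:], 2)
    for _, d := range readDescriptors(disk, func() sizedDecoder { return &desc64{} }) {
        fmt.Printf("%+v\n", d)
    }
}
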
diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go
index 3e16c76db..a35fa22a0 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go
@@ -17,6 +17,8 @@ package disklayout
// BlockGroup32Bit emulates the first half of struct ext4_group_desc in
// fs/ext4/ext4.h. It is the block group descriptor struct for ext2, ext3 and
// 32-bit ext4 filesystems. It implements BlockGroup interface.
+//
+// +marshal
type BlockGroup32Bit struct {
BlockBitmapLo uint32
InodeBitmapLo uint32
diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go
index 9a809197a..d54d1d345 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go
@@ -18,6 +18,8 @@ package disklayout
// It is the block group descriptor struct for 64-bit ext4 filesystems.
// It implements BlockGroup interface. It is an extension of the 32-bit
// version of BlockGroup.
+//
+// +marshal
type BlockGroup64Bit struct {
 // We embed the 32-bit struct here because the 64-bit version is just an extension
// of the 32-bit version.
diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go
index 0ef4294c0..e4ce484e4 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go
@@ -21,6 +21,8 @@ import (
// TestBlockGroupSize tests that the block group descriptor structs are of the
// correct size.
func TestBlockGroupSize(t *testing.T) {
- assertSize(t, BlockGroup32Bit{}, 32)
- assertSize(t, BlockGroup64Bit{}, 64)
+ var bgSmall BlockGroup32Bit
+ assertSize(t, &bgSmall, 32)
+ var bgBig BlockGroup64Bit
+ assertSize(t, &bgBig, 64)
}
diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent.go b/pkg/sentry/fsimpl/ext/disklayout/dirent.go
index 417b6cf65..568c8cb4c 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/dirent.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/dirent.go
@@ -15,6 +15,7 @@
package disklayout
import (
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/sentry/fs"
)
@@ -51,6 +52,8 @@ var (
//
// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#linear-classic-directories.
type Dirent interface {
+ marshal.Marshallable
+
// Inode returns the absolute inode number of the underlying inode.
// Inode number 0 signifies an unused dirent.
Inode() uint32
diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go
index 29ae4a5c2..51f9c2946 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go
@@ -29,12 +29,14 @@ import (
// Note: This struct can be of variable size on disk. The one described below
// is of maximum size and the FileName beyond NameLength bytes might contain
// garbage.
+//
+// +marshal
type DirentNew struct {
InodeNumber uint32
RecordLength uint16
NameLength uint8
FileTypeRaw uint8
- FileNameRaw [MaxFileName]byte
+ FileNameRaw [MaxFileName]byte `marshal:"unaligned"`
}
// Compiles only if DirentNew implements Dirent.
diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go
index 6fff12a6e..d4b19e086 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go
@@ -22,11 +22,13 @@ import "gvisor.dev/gvisor/pkg/sentry/fs"
// Note: This struct can be of variable size on disk. The one described below
// is of maximum size and the FileName beyond NameLength bytes might contain
// garbage.
+//
+// +marshal
type DirentOld struct {
InodeNumber uint32
RecordLength uint16
NameLength uint16
- FileNameRaw [MaxFileName]byte
+ FileNameRaw [MaxFileName]byte `marshal:"unaligned"`
}
// Compiles only if DirentOld implements Dirent.
diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go
index 934919f8a..3486864dc 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go
@@ -21,6 +21,8 @@ import (
// TestDirentSize tests that the dirent structs are of the correct
// size.
func TestDirentSize(t *testing.T) {
- assertSize(t, DirentOld{}, uintptr(DirentSize))
- assertSize(t, DirentNew{}, uintptr(DirentSize))
+ var dOld DirentOld
+ assertSize(t, &dOld, DirentSize)
+ var dNew DirentNew
+ assertSize(t, &dNew, DirentSize)
}
diff --git a/pkg/sentry/fsimpl/ext/disklayout/disklayout.go b/pkg/sentry/fsimpl/ext/disklayout/disklayout.go
index bdf4e2132..0834e9ba8 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/disklayout.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/disklayout.go
@@ -36,8 +36,6 @@
// escape analysis on an unknown implementation at compile time.
//
// Notes:
-// - All fields in these structs are exported because binary.Read would
-// panic otherwise.
// - All structures on disk are in little-endian order. Only jbd2 (journal)
// structures are in big-endian order.
 // - All OS dependent fields in these structures will be interpreted using
diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent.go b/pkg/sentry/fsimpl/ext/disklayout/extent.go
index 4110649ab..b13999bfc 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/extent.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/extent.go
@@ -14,6 +14,10 @@
package disklayout
+import (
+ "gvisor.dev/gvisor/pkg/marshal"
+)
+
// Extents were introduced in ext4 and provide huge performance gains in terms
// data locality and reduced metadata block usage. Extents are organized in
// extent trees. The root node is contained in inode.BlocksRaw.
@@ -64,6 +68,8 @@ type ExtentNode struct {
// ExtentEntry represents an extent tree node entry. The entry can either be
// an ExtentIdx or Extent itself. This exists to simplify navigation logic.
type ExtentEntry interface {
+ marshal.Marshallable
+
// FileBlock returns the first file block number covered by this entry.
FileBlock() uint32
@@ -75,6 +81,8 @@ type ExtentEntry interface {
// tree node begins with this and is followed by `NumEntries` number of:
// - Extent if `Depth` == 0
// - ExtentIdx otherwise
+//
+// +marshal
type ExtentHeader struct {
 // Magic is the extent magic number; it must be 0xf30a.
Magic uint16
@@ -96,6 +104,8 @@ type ExtentHeader struct {
// internal nodes. Sorted in ascending order based on FirstFileBlock since
// Linux does a binary search on this. This points to a block containing the
// child node.
+//
+// +marshal
type ExtentIdx struct {
FirstFileBlock uint32
ChildBlockLo uint32
@@ -121,6 +131,8 @@ func (ei *ExtentIdx) PhysicalBlock() uint64 {
// nodes. Sorted in ascending order based on FirstFileBlock since Linux does a
// binary search on this. This points to an array of data blocks containing the
// file data. It covers `Length` data blocks starting from `StartBlock`.
+//
+// +marshal
type Extent struct {
FirstFileBlock uint32
Length uint16
diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
index 8762b90db..c96002e19 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
@@ -21,7 +21,10 @@ import (
// TestExtentSize tests that the extent structs are of the correct
// size.
func TestExtentSize(t *testing.T) {
- assertSize(t, ExtentHeader{}, ExtentHeaderSize)
- assertSize(t, ExtentIdx{}, ExtentEntrySize)
- assertSize(t, Extent{}, ExtentEntrySize)
+ var h ExtentHeader
+ assertSize(t, &h, ExtentHeaderSize)
+ var i ExtentIdx
+ assertSize(t, &i, ExtentEntrySize)
+ var e Extent
+ assertSize(t, &e, ExtentEntrySize)
}
diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode.go b/pkg/sentry/fsimpl/ext/disklayout/inode.go
index 88ae913f5..ef25040a9 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/inode.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/inode.go
@@ -16,6 +16,7 @@ package disklayout
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/kernel/time"
)
@@ -38,6 +39,8 @@ const (
//
// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#index-nodes.
type Inode interface {
+ marshal.Marshallable
+
// Mode returns the linux file mode which is majorly used to extract
// information like:
// - File permissions (read/write/execute by user/group/others).
diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_new.go b/pkg/sentry/fsimpl/ext/disklayout/inode_new.go
index 8f9f574ce..a4503f5cf 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/inode_new.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/inode_new.go
@@ -27,6 +27,8 @@ import "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 // are used to provide nanosecond precision. Hence, these timestamps will now
// overflow in May 2446.
// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps.
+//
+// +marshal
type InodeNew struct {
InodeOld
diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_old.go b/pkg/sentry/fsimpl/ext/disklayout/inode_old.go
index db25b11b6..e6b28babf 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/inode_old.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/inode_old.go
@@ -30,6 +30,8 @@ const (
//
// All fields representing time are in seconds since the epoch. Which means that
// they will overflow in January 2038.
+//
+// +marshal
type InodeOld struct {
ModeRaw uint16
UIDLo uint16
diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_test.go b/pkg/sentry/fsimpl/ext/disklayout/inode_test.go
index dd03ee50e..90744e956 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/inode_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/inode_test.go
@@ -24,10 +24,12 @@ import (
// TestInodeSize tests that the inode structs are of the correct size.
func TestInodeSize(t *testing.T) {
- assertSize(t, InodeOld{}, OldInodeSize)
+ var iOld InodeOld
+ assertSize(t, &iOld, OldInodeSize)
// This was updated from 156 bytes to 160 bytes in Oct 2015.
- assertSize(t, InodeNew{}, 160)
+ var iNew InodeNew
+ assertSize(t, &iNew, 160)
}
// TestTimestampSeconds tests that the seconds part of [a/c/m] timestamps in
diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock.go b/pkg/sentry/fsimpl/ext/disklayout/superblock.go
index 8bb327006..70948ebe9 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/superblock.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock.go
@@ -14,6 +14,10 @@
package disklayout
+import (
+ "gvisor.dev/gvisor/pkg/marshal"
+)
+
const (
// SbOffset is the absolute offset at which the superblock is placed.
SbOffset = 1024
@@ -38,6 +42,8 @@ const (
//
// See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#super-block.
type SuperBlock interface {
+ marshal.Marshallable
+
// InodesCount returns the total number of inodes in this filesystem.
InodesCount() uint32
diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go
index 53e515fd3..4dc6080fb 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go
@@ -17,6 +17,8 @@ package disklayout
// SuperBlock32Bit implements SuperBlock and represents the 32-bit version of
// the ext4_super_block struct in fs/ext4/ext4.h. Should be used only if
// RevLevel = DynamicRev and 64-bit feature is disabled.
+//
+// +marshal
type SuperBlock32Bit struct {
// We embed the old superblock struct here because the 32-bit version is just
// an extension of the old version.
diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go
index 7c1053fb4..2c9039327 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go
@@ -19,6 +19,8 @@ package disklayout
// 1024 bytes (smallest possible block size) and hence the superblock always
// fits in no more than one data block. Should only be used when the 64-bit
// feature is set.
+//
+// +marshal
type SuperBlock64Bit struct {
 // We embed the 32-bit struct here because the 64-bit version is just an extension
// of the 32-bit version.
diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go
index 9221e0251..e4709f23c 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go
@@ -16,6 +16,8 @@ package disklayout
// SuperBlockOld implements SuperBlock and represents the old version of the
// superblock struct. Should be used only if RevLevel = OldRev.
+//
+// +marshal
type SuperBlockOld struct {
InodesCountRaw uint32
BlocksCountLo uint32
diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go
index 463b5ba21..b734b6987 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go
@@ -21,7 +21,10 @@ import (
// TestSuperBlockSize tests that the superblock structs are of the correct
// size.
func TestSuperBlockSize(t *testing.T) {
- assertSize(t, SuperBlockOld{}, 84)
- assertSize(t, SuperBlock32Bit{}, 336)
- assertSize(t, SuperBlock64Bit{}, 1024)
+ var sbOld SuperBlockOld
+ assertSize(t, &sbOld, 84)
+ var sb32 SuperBlock32Bit
+ assertSize(t, &sb32, 336)
+ var sb64 SuperBlock64Bit
+ assertSize(t, &sb64, 1024)
}
diff --git a/pkg/sentry/fsimpl/ext/disklayout/test_utils.go b/pkg/sentry/fsimpl/ext/disklayout/test_utils.go
index 9c63f04c0..a4bc08411 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/test_utils.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/test_utils.go
@@ -18,13 +18,13 @@ import (
"reflect"
"testing"
- "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/marshal"
)
-func assertSize(t *testing.T, v interface{}, want uintptr) {
+func assertSize(t *testing.T, v marshal.Marshallable, want int) {
t.Helper()
- if got := binary.Size(v); got != want {
+ if got := v.SizeBytes(); got != want {
t.Errorf("struct %s should be exactly %d bytes but is %d bytes", reflect.TypeOf(v).Name(), want, got)
}
}
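
The signature change above is why every assertSize call site in the tests now passes a pointer: binary.Size accepted any interface{} value, while SizeBytes comes from marshal.Marshallable, whose methods go_marshal generates on pointer receivers. A two-line illustration of the constraint (assuming the generated code; not compiled here):

var sb SuperBlockOld
var _ marshal.Marshallable = &sb // the pointer satisfies the interface
// var _ marshal.Marshallable = sb // the value would not compile
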
diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go
index dac6effbf..38fb7962b 100644
--- a/pkg/sentry/fsimpl/ext/ext.go
+++ b/pkg/sentry/fsimpl/ext/ext.go
@@ -34,11 +34,10 @@ import (
const Name = "ext"
// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type FilesystemType struct{}
-// Compiles only if FilesystemType implements vfs.FilesystemType.
-var _ vfs.FilesystemType = (*FilesystemType)(nil)
-
// getDeviceFd returns an io.ReaderAt to the underlying device.
// Currently there are two ways of mounting an ext(2/3/4) fs:
// 1. Specify a mount with our internal special MountType in the OCI spec.
@@ -99,6 +98,9 @@ func (FilesystemType) Name() string {
return Name
}
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
// TODO(b/134676337): Ensure that the user is mounting readonly. If not,
@@ -123,32 +125,32 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
fs.vfsfs.Init(vfsObj, &fsType, &fs)
fs.sb, err = readSuperBlock(dev)
if err != nil {
- fs.vfsfs.DecRef()
+ fs.vfsfs.DecRef(ctx)
return nil, nil, err
}
if fs.sb.Magic() != linux.EXT_SUPER_MAGIC {
// mount(2) specifies that EINVAL should be returned if the superblock is
// invalid.
- fs.vfsfs.DecRef()
+ fs.vfsfs.DecRef(ctx)
return nil, nil, syserror.EINVAL
}
// Refuse to mount if the filesystem is incompatible.
if !isCompatible(fs.sb) {
- fs.vfsfs.DecRef()
+ fs.vfsfs.DecRef(ctx)
return nil, nil, syserror.EINVAL
}
fs.bgs, err = readBlockGroups(dev, fs.sb)
if err != nil {
- fs.vfsfs.DecRef()
+ fs.vfsfs.DecRef(ctx)
return nil, nil, err
}
rootInode, err := fs.getOrCreateInodeLocked(disklayout.RootDirInode)
if err != nil {
- fs.vfsfs.DecRef()
+ fs.vfsfs.DecRef(ctx)
return nil, nil, err
}
rootInode.incRef()
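
Each early return in GetFilesystem above must DecRef the partially constructed filesystem. ext repeats the call on every error path; an alternative shape for the same obligation is a deferred guard disarmed on success, sketched here with a toy resource (an illustration of the idiom, not how ext is written):

package main

import "fmt"

type resource struct{ refs int }

func (r *resource) DecRef() { r.refs--; fmt.Println("released") }

func build(fail bool) (*resource, error) {
    r := &resource{refs: 1}
    succeeded := false
    defer func() {
        if !succeeded {
            r.DecRef() // one cleanup site instead of one per error return
        }
    }()
    if fail {
        return nil, fmt.Errorf("mid-initialization failure")
    }
    succeeded = true
    return r, nil
}

func main() {
    if _, err := build(true); err != nil {
        fmt.Println("build failed:", err)
    }
    r, _ := build(false)
    fmt.Println("refs:", r.refs)
}
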
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 64e9a579f..d9fd4590c 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -65,22 +65,27 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys
// Create VFS.
vfsObj := &vfs.VirtualFilesystem{}
- if err := vfsObj.Init(); err != nil {
+ if err := vfsObj.Init(ctx); err != nil {
t.Fatalf("VFS init: %v", err)
}
vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.MountOptions{
+ GetFilesystemOptions: vfs.GetFilesystemOptions{
+ InternalData: int(f.Fd()),
+ },
+ })
if err != nil {
f.Close()
return nil, nil, nil, nil, err
}
root := mntns.Root()
+ root.IncRef()
tearDown := func() {
- root.DecRef()
+ root.DecRef(ctx)
if err := f.Close(); err != nil {
t.Fatalf("tearDown failed: %v", err)
diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go
index 11dcc0346..778460107 100644
--- a/pkg/sentry/fsimpl/ext/extent_file.go
+++ b/pkg/sentry/fsimpl/ext/extent_file.go
@@ -18,12 +18,13 @@ import (
"io"
"sort"
- "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
"gvisor.dev/gvisor/pkg/syserror"
)
// extentFile is a type of regular file which uses extents to store file data.
+//
+// +stateify savable
type extentFile struct {
regFile regularFile
@@ -38,9 +39,10 @@ var _ io.ReaderAt = (*extentFile)(nil)
// newExtentFile is the extent file constructor. It reads the entire extent
// tree into memory.
// TODO(b/134676337): Build extent tree on demand to reduce memory usage.
-func newExtentFile(regFile regularFile) (*extentFile, error) {
- file := &extentFile{regFile: regFile}
+func newExtentFile(args inodeArgs) (*extentFile, error) {
+ file := &extentFile{}
file.regFile.impl = file
+ file.regFile.inode.init(args, &file.regFile)
err := file.buildExtTree()
if err != nil {
return nil, err
@@ -57,7 +59,7 @@ func newExtentFile(regFile regularFile) (*extentFile, error) {
func (f *extentFile) buildExtTree() error {
rootNodeData := f.regFile.inode.diskInode.Data()
- binary.Unmarshal(rootNodeData[:disklayout.ExtentHeaderSize], binary.LittleEndian, &f.root.Header)
+ f.root.Header.UnmarshalBytes(rootNodeData[:disklayout.ExtentHeaderSize])
 // Root node cannot have more than 4 entries: the inode's 60-byte block area
 // holds one 12-byte header plus at most (60-12)/12 = 4 twelve-byte entries.
if f.root.Header.NumEntries > 4 {
@@ -76,7 +78,7 @@ func (f *extentFile) buildExtTree() error {
// Internal node.
curEntry = &disklayout.ExtentIdx{}
}
- binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentEntrySize], binary.LittleEndian, curEntry)
+ curEntry.UnmarshalBytes(rootNodeData[off : off+disklayout.ExtentEntrySize])
f.root.Entries[i].Entry = curEntry
}
diff --git a/pkg/sentry/fsimpl/ext/extent_test.go b/pkg/sentry/fsimpl/ext/extent_test.go
index a2382daa3..985f76ac0 100644
--- a/pkg/sentry/fsimpl/ext/extent_test.go
+++ b/pkg/sentry/fsimpl/ext/extent_test.go
@@ -21,7 +21,6 @@ import (
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
- "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
)
@@ -177,21 +176,19 @@ func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (*extentFile, []
t.Helper()
mockDisk := make([]byte, mockExtentBlkSize*10)
- mockExtentFile := &extentFile{
- regFile: regularFile{
- inode: inode{
- fs: &filesystem{
- dev: bytes.NewReader(mockDisk),
- },
- diskInode: &disklayout.InodeNew{
- InodeOld: disklayout.InodeOld{
- SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root),
- },
- },
- blkSize: mockExtentBlkSize,
+ mockExtentFile := &extentFile{}
+ args := inodeArgs{
+ fs: &filesystem{
+ dev: bytes.NewReader(mockDisk),
+ },
+ diskInode: &disklayout.InodeNew{
+ InodeOld: disklayout.InodeOld{
+ SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root),
},
},
+ blkSize: mockExtentBlkSize,
}
+ mockExtentFile.regFile.inode.init(args, &mockExtentFile.regFile)
fileData := writeTree(&mockExtentFile.regFile.inode, mockDisk, node0, mockExtentBlkSize)
@@ -204,13 +201,14 @@ func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (*extentFile, []
// writeTree writes the tree represented by `root` to the inode and disk. It
// also writes random file data on disk.
func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, mockExtentBlkSize uint64) []byte {
- rootData := binary.Marshal(nil, binary.LittleEndian, root.Header)
+ rootData := in.diskInode.Data()
+ root.Header.MarshalBytes(rootData)
+ off := root.Header.SizeBytes()
for _, ep := range root.Entries {
- rootData = binary.Marshal(rootData, binary.LittleEndian, ep.Entry)
+ ep.Entry.MarshalBytes(rootData[off:])
+ off += ep.Entry.SizeBytes()
}
- copy(in.diskInode.Data(), rootData)
-
var fileData []byte
for _, ep := range root.Entries {
if root.Header.Height == 0 {
@@ -225,13 +223,14 @@ func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, mockExtentBl
// writeTreeToDisk is the recursive step for writeTree which writes the tree
// on the disk only. Also writes random file data on disk.
func writeTreeToDisk(disk []byte, curNode disklayout.ExtentEntryPair) []byte {
- nodeData := binary.Marshal(nil, binary.LittleEndian, curNode.Node.Header)
+ nodeData := disk[curNode.Entry.PhysicalBlock()*mockExtentBlkSize:]
+ curNode.Node.Header.MarshalBytes(nodeData)
+ off := curNode.Node.Header.SizeBytes()
for _, ep := range curNode.Node.Entries {
- nodeData = binary.Marshal(nodeData, binary.LittleEndian, ep.Entry)
+ ep.Entry.MarshalBytes(nodeData[off:])
+ off += ep.Entry.SizeBytes()
}
- copy(disk[curNode.Entry.PhysicalBlock()*mockExtentBlkSize:], nodeData)
-
var fileData []byte
for _, ep := range curNode.Node.Entries {
if curNode.Node.Header.Height == 0 {
diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go
index 92f7da40d..90b086468 100644
--- a/pkg/sentry/fsimpl/ext/file_description.go
+++ b/pkg/sentry/fsimpl/ext/file_description.go
@@ -26,6 +26,7 @@ import (
type fileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
+ vfs.LockFD
}
func (fd *fileDescription) filesystem() *filesystem {
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index 557963e03..917f1873d 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -38,11 +38,13 @@ var (
)
// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
type filesystem struct {
vfsfs vfs.Filesystem
// mu serializes changes to the Dentry tree.
- mu sync.RWMutex
+ mu sync.RWMutex `state:"nosave"`
// dev represents the underlying fs device. It does not require protection
// because io.ReaderAt permits concurrent read calls to it. It translates to
@@ -81,10 +83,10 @@ var _ vfs.FilesystemImpl = (*filesystem)(nil)
// stepLocked is loosely analogous to fs/namei.c:walk_component().
//
// Preconditions:
-// - filesystem.mu must be locked (for writing if write param is true).
-// - !rp.Done().
-// - inode == vfsd.Impl().(*Dentry).inode.
-func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) {
+// * filesystem.mu must be locked (for writing if write param is true).
+// * !rp.Done().
+// * inode == vfsd.Impl().(*Dentry).inode.
+func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) {
if !inode.isDir() {
return nil, nil, syserror.ENOTDIR
}
@@ -100,7 +102,7 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo
}
d := vfsd.Impl().(*dentry)
if name == ".." {
- isRoot, err := rp.CheckRoot(vfsd)
+ isRoot, err := rp.CheckRoot(ctx, vfsd)
if err != nil {
return nil, nil, err
}
@@ -108,7 +110,7 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo
rp.Advance()
return vfsd, inode, nil
}
- if err := rp.CheckMount(&d.parent.vfsd); err != nil {
+ if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
return nil, nil, err
}
rp.Advance()
@@ -143,7 +145,7 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo
child.name = name
dir.childCache[name] = child
}
- if err := rp.CheckMount(&child.vfsd); err != nil {
+ if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
return nil, nil, err
}
if child.inode.isSymlink() && rp.ShouldFollowSymlink() {
@@ -166,13 +168,13 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo
// walkLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
//
// Preconditions:
-// - filesystem.mu must be locked (for writing if write param is true).
-func walkLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
+// * filesystem.mu must be locked (for writing if write param is true).
+func walkLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
vfsd := rp.Start()
inode := vfsd.Impl().(*dentry).inode
for !rp.Done() {
var err error
- vfsd, inode, err = stepLocked(rp, vfsd, inode, write)
+ vfsd, inode, err = stepLocked(ctx, rp, vfsd, inode, write)
if err != nil {
return nil, nil, err
}
@@ -194,14 +196,14 @@ func walkLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error)
// walkParentLocked is loosely analogous to Linux's fs/namei.c:path_parentat().
//
// Preconditions:
-// - filesystem.mu must be locked (for writing if write param is true).
-// - !rp.Done().
-func walkParentLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
+// * filesystem.mu must be locked (for writing if write param is true).
+// * !rp.Done().
+func walkParentLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
vfsd := rp.Start()
inode := vfsd.Impl().(*dentry).inode
for !rp.Final() {
var err error
- vfsd, inode, err = stepLocked(rp, vfsd, inode, write)
+ vfsd, inode, err = stepLocked(ctx, rp, vfsd, inode, write)
if err != nil {
return nil, nil, err
}
@@ -216,7 +218,7 @@ func walkParentLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, e
// the rp till the parent of the last component which should be an existing
 // directory. If parent is false then resolves rp entirely. Attempts to resolve
// the path as far as it can with a read lock and upgrades the lock if needed.
-func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) {
+func (fs *filesystem) walk(ctx context.Context, rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) {
var (
vfsd *vfs.Dentry
inode *inode
@@ -227,9 +229,9 @@ func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *in
// of disk. This reduces congestion (allows concurrent walks).
fs.mu.RLock()
if parent {
- vfsd, inode, err = walkParentLocked(rp, false)
+ vfsd, inode, err = walkParentLocked(ctx, rp, false)
} else {
- vfsd, inode, err = walkLocked(rp, false)
+ vfsd, inode, err = walkLocked(ctx, rp, false)
}
fs.mu.RUnlock()
@@ -238,9 +240,9 @@ func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *in
// walk is fine as this is a read only filesystem.
fs.mu.Lock()
if parent {
- vfsd, inode, err = walkParentLocked(rp, true)
+ vfsd, inode, err = walkParentLocked(ctx, rp, true)
} else {
- vfsd, inode, err = walkLocked(rp, true)
+ vfsd, inode, err = walkLocked(ctx, rp, true)
}
fs.mu.Unlock()
}
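
The two-phase locking in walk above is an optimistic pattern: resolve the whole path under the read lock so concurrent walks proceed in parallel, and retry under the write lock only when the read-locked attempt reports that it needed to mutate the dentry cache (the exact retry condition is elided from this hunk). A self-contained sketch of the same shape:

package main

import (
    "fmt"
    "sync"
)

type cache struct {
    mu sync.RWMutex
    m  map[string]int
}

// lookup first tries under the read lock so concurrent lookups proceed in
// parallel, and takes the write lock only on a miss -- the same two-phase
// shape as filesystem.walk above.
func (c *cache) lookup(key string) int {
    c.mu.RLock()
    v, ok := c.m[key]
    c.mu.RUnlock()
    if ok {
        return v
    }
    // Slow path: re-check under the write lock (another goroutine may have
    // filled the entry between the two lock acquisitions).
    c.mu.Lock()
    defer c.mu.Unlock()
    if v, ok := c.m[key]; ok {
        return v
    }
    c.m[key] = len(c.m)
    return c.m[key]
}

func main() {
    c := &cache{m: map[string]int{}}
    fmt.Println(c.lookup("a"), c.lookup("a"))
}
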
@@ -283,7 +285,7 @@ func (fs *filesystem) statTo(stat *linux.Statfs) {
// AccessAt implements vfs.Filesystem.Impl.AccessAt.
func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
- _, inode, err := fs.walk(rp, false)
+ _, inode, err := fs.walk(ctx, rp, false)
if err != nil {
return err
}
@@ -292,7 +294,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds
// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
- vfsd, inode, err := fs.walk(rp, false)
+ vfsd, inode, err := fs.walk(ctx, rp, false)
if err != nil {
return nil, err
}
@@ -312,7 +314,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
- vfsd, inode, err := fs.walk(rp, true)
+ vfsd, inode, err := fs.walk(ctx, rp, true)
if err != nil {
return nil, err
}
@@ -322,7 +324,7 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa
// OpenAt implements vfs.FilesystemImpl.OpenAt.
func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- vfsd, inode, err := fs.walk(rp, false)
+ vfsd, inode, err := fs.walk(ctx, rp, false)
if err != nil {
return nil, err
}
@@ -336,7 +338,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
- _, inode, err := fs.walk(rp, false)
+ _, inode, err := fs.walk(ctx, rp, false)
if err != nil {
return "", err
}
@@ -349,7 +351,7 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st
// StatAt implements vfs.FilesystemImpl.StatAt.
func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
- _, inode, err := fs.walk(rp, false)
+ _, inode, err := fs.walk(ctx, rp, false)
if err != nil {
return linux.Statx{}, err
}
@@ -360,7 +362,7 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
- if _, _, err := fs.walk(rp, false); err != nil {
+ if _, _, err := fs.walk(ctx, rp, false); err != nil {
return linux.Statfs{}, err
}
@@ -370,7 +372,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
}
// Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {
+func (fs *filesystem) Release(ctx context.Context) {
fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
}
@@ -390,7 +392,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
return syserror.EEXIST
}
- if _, _, err := fs.walk(rp, true); err != nil {
+ if _, _, err := fs.walk(ctx, rp, true); err != nil {
return err
}
@@ -403,7 +405,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
return syserror.EEXIST
}
- if _, _, err := fs.walk(rp, true); err != nil {
+ if _, _, err := fs.walk(ctx, rp, true); err != nil {
return err
}
@@ -416,7 +418,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
return syserror.EEXIST
}
- _, _, err := fs.walk(rp, true)
+ _, _, err := fs.walk(ctx, rp, true)
if err != nil {
return err
}
@@ -430,7 +432,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
return syserror.ENOENT
}
- _, _, err := fs.walk(rp, false)
+ _, _, err := fs.walk(ctx, rp, false)
if err != nil {
return err
}
@@ -440,7 +442,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
- _, inode, err := fs.walk(rp, false)
+ _, inode, err := fs.walk(ctx, rp, false)
if err != nil {
return err
}
@@ -454,7 +456,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
- _, _, err := fs.walk(rp, false)
+ _, _, err := fs.walk(ctx, rp, false)
if err != nil {
return err
}
@@ -468,7 +470,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
return syserror.EEXIST
}
- _, _, err := fs.walk(rp, true)
+ _, _, err := fs.walk(ctx, rp, true)
if err != nil {
return err
}
@@ -478,7 +480,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
- _, inode, err := fs.walk(rp, false)
+ _, inode, err := fs.walk(ctx, rp, false)
if err != nil {
return err
}
@@ -490,9 +492,9 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
return syserror.EROFS
}
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
- _, inode, err := fs.walk(rp, false)
+ _, inode, err := fs.walk(ctx, rp, false)
if err != nil {
return nil, err
}
@@ -504,36 +506,36 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
return nil, syserror.ECONNREFUSED
}
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
- _, _, err := fs.walk(rp, false)
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+ _, _, err := fs.walk(ctx, rp, false)
if err != nil {
return nil, err
}
return nil, syserror.ENOTSUP
}
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
- _, _, err := fs.walk(rp, false)
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
+ _, _, err := fs.walk(ctx, rp, false)
if err != nil {
return "", err
}
return "", syserror.ENOTSUP
}
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
- _, _, err := fs.walk(rp, false)
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
+ _, _, err := fs.walk(ctx, rp, false)
if err != nil {
return err
}
return syserror.ENOTSUP
}
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
- _, _, err := fs.walk(rp, false)
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+ _, _, err := fs.walk(ctx, rp, false)
if err != nil {
return err
}
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index 485f86f4b..9009ba3c7 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -37,6 +37,8 @@ import (
// |-- regular--
// |-- extent file
// |-- block map file
+//
+// +stateify savable
type inode struct {
// refs is a reference count. refs is accessed using atomic memory operations.
refs int64
@@ -54,6 +56,8 @@ type inode struct {
// diskInode gives us access to the inode struct on disk. Immutable.
diskInode disklayout.Inode
+ locks vfs.FileLocks
+
// This is immutable. The first field of the implementations must have inode
// as the first field, so that a pointer to the inode is also a pointer to
// the implementation.
impl interface{}
@@ -115,7 +119,7 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
}
// Build the inode based on its type.
- inode := inode{
+ args := inodeArgs{
fs: fs,
inodeNum: inodeNum,
blkSize: blkSize,
@@ -124,19 +128,19 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
switch diskInode.Mode().FileType() {
case linux.ModeSymlink:
- f, err := newSymlink(inode)
+ f, err := newSymlink(args)
if err != nil {
return nil, err
}
return &f.inode, nil
case linux.ModeRegular:
- f, err := newRegularFile(inode)
+ f, err := newRegularFile(args)
if err != nil {
return nil, err
}
return &f.inode, nil
case linux.ModeDirectory:
- f, err := newDirectory(inode, fs.sb.IncompatibleFeatures().DirentFileType)
+ f, err := newDirectory(args, fs.sb.IncompatibleFeatures().DirentFileType)
if err != nil {
return nil, err
}
@@ -147,6 +151,21 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
}
}
+type inodeArgs struct {
+ fs *filesystem
+ inodeNum uint32
+ blkSize uint64
+ diskInode disklayout.Inode
+}
+
+func (in *inode) init(args inodeArgs, impl interface{}) {
+ in.fs = args.fs
+ in.inodeNum = args.inodeNum
+ in.blkSize = args.blkSize
+ in.diskInode = args.diskInode
+ in.impl = impl
+}
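A sketch of how a file-type implementation would use init, building on the inode and inodeArgs types above; the fifo type below is hypothetical and only illustrates the embedding pattern that newSymlink and the other constructors follow:

// fifo embeds inode as its first field, as the inode comment requires.
type fifo struct {
	inode inode
}

func newFifo(args inodeArgs) *fifo {
	f := &fifo{}
	// init copies args into the embedded inode and records f as in.impl,
	// so the inode can later be resolved back to its implementation.
	f.inode.init(args, f)
	return f
}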
+
// open creates and returns a file description for the dentry passed in.
func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
ats := vfs.AccessTypesForOpenFlags(opts)
@@ -157,6 +176,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt
switch in.impl.(type) {
case *regularFile:
var fd regularFileFD
+ fd.LockFD.Init(&in.locks)
if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
return nil, err
}
@@ -168,6 +188,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt
return nil, syserror.EISDIR
}
var fd directoryFD
+ fd.LockFD.Init(&in.locks)
if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
return nil, err
}
@@ -178,6 +199,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt
return nil, syserror.ELOOP
}
var fd symlinkFD
+ fd.LockFD.Init(&in.locks)
fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
return &fd.vfsfd, nil
default:
diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go
index 30135ddb0..4a5539b37 100644
--- a/pkg/sentry/fsimpl/ext/regular_file.go
+++ b/pkg/sentry/fsimpl/ext/regular_file.go
@@ -20,6 +20,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/safemem"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
@@ -30,6 +31,8 @@ import (
// regularFile represents a regular file's inode. This too follows the
// inheritance pattern prevalent in the vfs layer described in
// pkg/sentry/vfs/README.md.
+//
+// +stateify savable
type regularFile struct {
inode inode
@@ -43,28 +46,19 @@ type regularFile struct {
// newRegularFile is the regularFile constructor. It figures out what kind of
// file this is and initializes the fileReader.
-func newRegularFile(inode inode) (*regularFile, error) {
- regFile := regularFile{
- inode: inode,
- }
-
- inodeFlags := inode.diskInode.Flags()
-
- if inodeFlags.Extents {
- file, err := newExtentFile(regFile)
+func newRegularFile(args inodeArgs) (*regularFile, error) {
+ if args.diskInode.Flags().Extents {
+ file, err := newExtentFile(args)
if err != nil {
return nil, err
}
-
- file.regFile.inode.impl = &file.regFile
return &file.regFile, nil
}
- file, err := newBlockMapFile(regFile)
+ file, err := newBlockMapFile(args)
if err != nil {
return nil, err
}
- file.regFile.inode.impl = &file.regFile
return &file.regFile, nil
}
@@ -75,18 +69,21 @@ func (in *inode) isRegular() bool {
// regularFileFD represents a regular file's file description. It implements
// vfs.FileDescriptionImpl.
+//
+// +stateify savable
type regularFileFD struct {
fileDescription
+ vfs.LockFD
// off is the file offset. off is accessed using atomic memory operations.
off int64
// offMu serializes operations that may mutate off.
- offMu sync.Mutex
+ offMu sync.Mutex `state:"nosave"`
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *regularFileFD) Release() {}
+func (fd *regularFileFD) Release(context.Context) {}
// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
@@ -157,3 +154,13 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt
// TODO(b/134676337): Implement mmap(2).
return syserror.ENODEV
}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go
index 1447a4dc1..5e2bcc837 100644
--- a/pkg/sentry/fsimpl/ext/symlink.go
+++ b/pkg/sentry/fsimpl/ext/symlink.go
@@ -23,6 +23,8 @@ import (
)
// symlink represents a symlink inode.
+//
+// +stateify savable
type symlink struct {
inode inode
target string // immutable
@@ -30,18 +32,17 @@ type symlink struct {
// newSymlink is the symlink constructor. It reads out the symlink target from
// the inode (however it might have been stored).
-func newSymlink(inode inode) (*symlink, error) {
- var file *symlink
+func newSymlink(args inodeArgs) (*symlink, error) {
var link []byte
// If the symlink target is less than 60 bytes, it is stored in inode.Data().
// Otherwise either extents or block maps will be used to store the link.
- size := inode.diskInode.Size()
+ size := args.diskInode.Size()
if size < 60 {
- link = inode.diskInode.Data()[:size]
+ link = args.diskInode.Data()[:size]
} else {
// Create a regular file out of this inode and read out the target.
- regFile, err := newRegularFile(inode)
+ regFile, err := newRegularFile(args)
if err != nil {
return nil, err
}
@@ -52,8 +53,8 @@ func newSymlink(inode inode) (*symlink, error) {
}
}
- file = &symlink{inode: inode, target: string(link)}
- file.inode.impl = file
+ file := &symlink{target: string(link)}
+ file.inode.init(args, file)
return file, nil
}
@@ -62,18 +63,21 @@ func (in *inode) isSymlink() bool {
return ok
}
-// symlinkFD represents a symlink file description and implements implements
+// symlinkFD represents a symlink file description and implements
// vfs.FileDescriptionImpl, which may only be used if open options contain
// O_PATH. For this reason most of the functions return EBADF.
+//
+// +stateify savable
type symlinkFD struct {
fileDescription
+ vfs.NoLockFD
}
// Compiles only if symlinkFD implements vfs.FileDescriptionImpl.
var _ vfs.FileDescriptionImpl = (*symlinkFD)(nil)
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *symlinkFD) Release() {}
+func (fd *symlinkFD) Release(context.Context) {}
// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *symlinkFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
diff --git a/pkg/sentry/fsimpl/ext/utils.go b/pkg/sentry/fsimpl/ext/utils.go
index d8b728f8c..58ef7b9b8 100644
--- a/pkg/sentry/fsimpl/ext/utils.go
+++ b/pkg/sentry/fsimpl/ext/utils.go
@@ -17,21 +17,21 @@ package ext
import (
"io"
- "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
"gvisor.dev/gvisor/pkg/syserror"
)
// readFromDisk performs a binary read from disk into the given struct from
// the absolute offset provided.
-func readFromDisk(dev io.ReaderAt, abOff int64, v interface{}) error {
- n := binary.Size(v)
+func readFromDisk(dev io.ReaderAt, abOff int64, v marshal.Marshallable) error {
+ n := v.SizeBytes()
buf := make([]byte, n)
if read, _ := dev.ReadAt(buf, abOff); read < int(n) {
return syserror.EIO
}
- binary.Unmarshal(buf, binary.LittleEndian, v)
+ v.UnmarshalBytes(buf)
return nil
}
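The new signature only needs the two Marshallable methods used here. A self-contained analogue of readFromDisk, with a toy interface standing in for pkg/marshal (types and names are illustrative):

package main

import (
	"encoding/binary"
	"fmt"
	"io"
	"strings"
)

// unmarshaler mirrors the two marshal.Marshallable methods readFromDisk uses.
type unmarshaler interface {
	SizeBytes() int
	UnmarshalBytes(src []byte)
}

type le32 struct{ v uint32 }

func (l *le32) SizeBytes() int            { return 4 }
func (l *le32) UnmarshalBytes(src []byte) { l.v = binary.LittleEndian.Uint32(src) }

// readAt reads exactly SizeBytes() bytes at an absolute offset and
// unmarshals them in place, failing on a short read.
func readAt(dev io.ReaderAt, off int64, v unmarshaler) error {
	buf := make([]byte, v.SizeBytes())
	if n, _ := dev.ReadAt(buf, off); n < len(buf) {
		return io.ErrUnexpectedEOF
	}
	v.UnmarshalBytes(buf)
	return nil
}

func main() {
	dev := strings.NewReader("\x2a\x00\x00\x00")
	var x le32
	if err := readAt(dev, 0, &x); err != nil {
		panic(err)
	}
	fmt.Println(x.v) // 42
}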
diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD
new file mode 100644
index 000000000..2158b1bbc
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/BUILD
@@ -0,0 +1,87 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+licenses(["notice"])
+
+go_template_instance(
+ name = "request_list",
+ out = "request_list.go",
+ package = "fuse",
+ prefix = "request",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*Request",
+ "Linker": "*Request",
+ },
+)
+
+go_template_instance(
+ name = "inode_refs",
+ out = "inode_refs.go",
+ package = "fuse",
+ prefix = "inode",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "inode",
+ },
+)
+
+go_library(
+ name = "fuse",
+ srcs = [
+ "connection.go",
+ "connection_control.go",
+ "dev.go",
+ "directory.go",
+ "file.go",
+ "fusefs.go",
+ "inode_refs.go",
+ "read_write.go",
+ "register.go",
+ "regular_file.go",
+ "request_list.go",
+ "request_response.go",
+ ],
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/context",
+ "//pkg/log",
+ "//pkg/marshal",
+ "//pkg/refs",
+ "//pkg/refsvfs2",
+ "//pkg/safemem",
+ "//pkg/sentry/fsimpl/devtmpfs",
+ "//pkg/sentry/fsimpl/kernfs",
+ "//pkg/sentry/kernel",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/vfs",
+ "//pkg/sync",
+ "//pkg/syserror",
+ "//pkg/usermem",
+ "//pkg/waiter",
+ "@org_golang_x_sys//unix:go_default_library",
+ ],
+)
+
+go_test(
+ name = "fuse_test",
+ size = "small",
+ srcs = [
+ "connection_test.go",
+ "dev_test.go",
+ "utils_test.go",
+ ],
+ library = ":fuse",
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/marshal",
+ "//pkg/sentry/fsimpl/testutil",
+ "//pkg/sentry/kernel",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/vfs",
+ "//pkg/syserror",
+ "//pkg/usermem",
+ "//pkg/waiter",
+ ],
+)
diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go
new file mode 100644
index 000000000..8ccda1264
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/connection.go
@@ -0,0 +1,322 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "sync"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+ // fuseDefaultMaxBackground is the default value for MaxBackground.
+ fuseDefaultMaxBackground = 12
+
+ // fuseDefaultCongestionThreshold is the default value for CongestionThreshold,
+ // and is 75% of the default value of MaxBackground.
+ fuseDefaultCongestionThreshold = (fuseDefaultMaxBackground * 3 / 4)
+
+ // fuseDefaultMaxPagesPerReq is the default value for MaxPagesPerReq.
+ fuseDefaultMaxPagesPerReq = 32
+)
+
+// connection is the struct by which the sentry communicates with the FUSE server daemon.
+//
+// Lock order:
+// - conn.fd.mu
+// - conn.mu
+// - conn.asyncMu
+//
+// +stateify savable
+type connection struct {
+ fd *DeviceFD
+
+ // mu protects access to struct members.
+ mu sync.Mutex `state:"nosave"`
+
+ // attributeVersion is the version of the connection's attributes.
+ attributeVersion uint64
+
+ // We target FUSE 7.23.
+ // The following FUSE_INIT flags are currently unsupported by this implementation:
+ // - FUSE_EXPORT_SUPPORT
+ // - FUSE_POSIX_LOCKS: requires POSIX locks
+ // - FUSE_FLOCK_LOCKS: requires POSIX locks
+ // - FUSE_AUTO_INVAL_DATA: requires page caching eviction
+ // - FUSE_DO_READDIRPLUS/FUSE_READDIRPLUS_AUTO: requires FUSE_READDIRPLUS implementation
+ // - FUSE_ASYNC_DIO
+ // - FUSE_PARALLEL_DIROPS (7.25)
+ // - FUSE_HANDLE_KILLPRIV (7.26)
+ // - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler (7.26)
+ // - FUSE_ABORT_ERROR (7.27)
+ // - FUSE_CACHE_SYMLINKS (7.28)
+ // - FUSE_NO_OPENDIR_SUPPORT (7.29)
+ // - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction (7.30)
+ // - FUSE_MAP_ALIGNMENT (7.31)
+
+ // initialized is set after receiving the FUSE_INIT reply.
+ // Until it is set, sending FUSE requests is suspended.
+ // Use SetInitialized() and Initialized() for atomic access.
+ initialized int32
+
+ // initializedChan is used to block requests before initialization.
+ initializedChan chan struct{} `state:".(bool)"`
+
+ // connected is set to true when a new FUSE filesystem is created
+ // (the connection is established). Set to false on umount,
+ // connection abort, or device release.
+ connected bool
+
+ // connInitError is set if FUSE_INIT encountered an error (major version
+ // mismatch). Only set in INIT.
+ connInitError bool
+
+ // connInitSuccess is set if FUSE_INIT succeeded.
+ // Only set in INIT.
+ // Used for destroy (not yet implemented).
+ connInitSuccess bool
+
+ // aborted is set when the connection is aborted via sysfs; reads after
+ // disconnection then return ECONNABORTED instead of ENODEV.
+ // Set only if abortErr is true, via the fuse control fs (not yet implemented).
+ // TODO(gvisor.dev/issue/3525): set this to true when user aborts.
+ aborted bool
+
+ // numWaiting is the number of requests waiting to be
+ // sent to the FUSE device or being processed by the FUSE daemon.
+ numWaiting uint32
+
+ // Terminology note:
+ //
+ // - `asyncNumMax` is the `MaxBackground` in the FUSE_INIT_IN struct.
+ //
+ // - `asyncCongestionThreshold` is the `CongestionThreshold` in the FUSE_INIT_IN struct.
+ //
+ // What Linux calls "background" requests, we call async requests.
+ // Linux's "async requests" correspond to our async requests that expect a
+ // reply, i.e. `!request.noReply`.
+
+ // asyncMu protects the async request fields.
+ asyncMu sync.Mutex `state:"nosave"`
+
+ // asyncNum is the number of async requests.
+ // Protected by asyncMu.
+ asyncNum uint16
+
+ // asyncCongestionThreshold is the threshold on the number of async requests.
+ // Negotiated in FUSE_INIT as "CongestionThreshold".
+ // TODO(gvisor.dev/issue/3529): add congestion control.
+ // Protected by asyncMu.
+ asyncCongestionThreshold uint16
+
+ // asyncNumMax is the maximum value of asyncNum.
+ // Connection blocks the async requests when it is reached.
+ // Negotiated in FUSE_INIT as "MaxBackground".
+ // Protected by asyncMu.
+ asyncNumMax uint16
+
+ // maxRead is the maximum size of a read buffer in bytes.
+ // Initialized from a fuse fs parameter.
+ maxRead uint32
+
+ // maxWrite is the maximum size of a write buffer in bytes.
+ // Negotiated in FUSE_INIT.
+ maxWrite uint32
+
+ // maxPages is the maximum number of pages for a single request to use.
+ // Negotiated in FUSE_INIT.
+ maxPages uint16
+
+ // minor version of the FUSE protocol.
+ // Negotiated and only set in INIT.
+ minor uint32
+
+ // atomicOTrunc is true when FUSE does not send a separate SETATTR request
+ // before open with O_TRUNC flag.
+ // Negotiated and only set in INIT.
+ atomicOTrunc bool
+
+ // asyncRead is set if pages are read asynchronously.
+ // Negotiated and only set in INIT.
+ asyncRead bool
+
+ // writebackCache is true for write-back cache policy,
+ // false for write-through policy.
+ // Negotiated and only set in INIT.
+ writebackCache bool
+
+ // bigWrites is set if multi-page cached writes are used.
+ // Negotiated and only set in INIT.
+ bigWrites bool
+
+ // dontMask is set if the filesystem does not apply umask to creation modes.
+ // Negotiated in INIT.
+ dontMask bool
+
+ // noOpen is set if the FUSE server doesn't support the open operation.
+ // This flag only influences performance, not correctness.
+ noOpen bool
+}
+
+func (conn *connection) saveInitializedChan() bool {
+ select {
+ case <-conn.initializedChan:
+ return true // Closed.
+ default:
+ return false // Not closed.
+ }
+}
+
+func (conn *connection) loadInitializedChan(closed bool) {
+ conn.initializedChan = make(chan struct{}, 1)
+ if closed {
+ close(conn.initializedChan)
+ }
+}
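The select-with-default in saveInitializedChan is the standard non-blocking probe for whether a signal channel has been closed; it is only valid because initializedChan is closed, never sent to. A minimal demonstration:

package main

import "fmt"

// isClosed reports whether ch has been closed. Correct only for channels
// used purely as close-once signals (nothing is ever sent on them).
func isClosed(ch chan struct{}) bool {
	select {
	case <-ch:
		return true // a receive completes immediately on a closed channel
	default:
		return false
	}
}

func main() {
	ch := make(chan struct{})
	fmt.Println(isClosed(ch)) // false
	close(ch)
	fmt.Println(isClosed(ch)) // true
}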
+
+// newFUSEConnection creates a FUSE connection to fd.
+func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, opts *filesystemOptions) (*connection, error) {
+ // Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to
+ // mount a FUSE filesystem.
+ fuseFD := fd.Impl().(*DeviceFD)
+
+ // Create the writeBuf for the header to be stored in.
+ hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
+ fuseFD.writeBuf = make([]byte, hdrLen)
+ fuseFD.completions = make(map[linux.FUSEOpID]*futureResponse)
+ fuseFD.fullQueueCh = make(chan struct{}, opts.maxActiveRequests)
+ fuseFD.writeCursor = 0
+
+ return &connection{
+ fd: fuseFD,
+ asyncNumMax: fuseDefaultMaxBackground,
+ asyncCongestionThreshold: fuseDefaultCongestionThreshold,
+ maxRead: opts.maxRead,
+ maxPages: fuseDefaultMaxPagesPerReq,
+ initializedChan: make(chan struct{}),
+ connected: true,
+ }, nil
+}
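The (*linux.FUSEHeaderOut)(nil).SizeBytes() expression above computes a type's wire size without allocating a value, which is safe because SizeBytes never dereferences its receiver. A toy illustration (the header type here is made up):

package main

import "fmt"

type header struct{ a, b uint32 }

// SizeBytes depends only on the type, so calling it on a nil *header is fine.
func (*header) SizeBytes() int { return 8 }

func main() {
	fmt.Println((*header)(nil).SizeBytes()) // 8
}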
+
+// CallAsync makes an async (aka background) request.
+// It's a simple wrapper around Call().
+func (conn *connection) CallAsync(t *kernel.Task, r *Request) error {
+ r.async = true
+ _, err := conn.Call(t, r)
+ return err
+}
+
+// Call makes a request to the server.
+// It blocks until the connection is initialized, except when the Request
+// is FUSE_INIT, which is never blocked before initialization.
+// Task should never be nil.
+//
+// For a sync request, it blocks the invoking task until
+// a server responds with a response.
+//
+// For an async request (one that does not expect a response immediately),
+// it returns directly unless blocked, either before initialization
+// or when there are too many ongoing async requests.
+//
+// Examples of async requests:
+// init, readahead, write, async read/write, fuse_notify_reply,
+// non-sync release, interrupt, forget.
+//
+// The forget request does not have a reply,
+// as documented in include/uapi/linux/fuse.h:FUSE_FORGET.
+func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) {
+ // Block requests sent before the connection is initialized.
+ if !conn.Initialized() && r.hdr.Opcode != linux.FUSE_INIT {
+ if err := t.Block(conn.initializedChan); err != nil {
+ return nil, err
+ }
+ }
+
+ if !conn.connected {
+ return nil, syserror.ENOTCONN
+ }
+
+ if conn.connInitError {
+ return nil, syserror.ECONNREFUSED
+ }
+
+ fut, err := conn.callFuture(t, r)
+ if err != nil {
+ return nil, err
+ }
+
+ return fut.resolve(t)
+}
+
+// callFuture makes a request to the server and returns a future response.
+// Call resolve() when the response needs to be fulfilled.
+func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, error) {
+ conn.fd.mu.Lock()
+ defer conn.fd.mu.Unlock()
+
+ // Is the queue full?
+ //
+ // We must wait here until the request can be queued. We don't
+ // hold the lock while blocking on fd.fullQueueCh, so after being
+ // signalled, before we reacquire the lock, it is possible that a
+ // barging task enters and queues a request. As a result, upon
+ // acquiring the lock we must again check if there is room available.
+ //
+ // This can potentially starve a request forever, but that can only
+ // happen if there are always too many ongoing requests. The supported
+ // maxActiveRequests setting should be high enough to avoid this.
+ for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests {
+ log.Infof("Blocking request %v from being queued. Too many active requests: %v",
+ r.id, conn.fd.numActiveRequests)
+ conn.fd.mu.Unlock()
+ err := t.Block(conn.fd.fullQueueCh)
+ conn.fd.mu.Lock()
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ return conn.callFutureLocked(t, r)
+}
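A standalone sketch of the wait loop above: release the lock, block on the signal channel, then reacquire and re-test, because a barging goroutine may take the freed slot first. All names here are illustrative:

package main

import "sync"

type bounded struct {
	mu    sync.Mutex
	n     int
	limit int
	room  chan struct{}
}

func newBounded(limit int) *bounded {
	return &bounded{limit: limit, room: make(chan struct{}, 1)}
}

func (b *bounded) acquire() {
	b.mu.Lock()
	for b.n == b.limit {
		b.mu.Unlock()
		<-b.room // woken when a slot may be free; not a guarantee we get it
		b.mu.Lock()
	}
	b.n++
	b.mu.Unlock()
}

func (b *bounded) release() {
	b.mu.Lock()
	b.n--
	b.mu.Unlock()
	select {
	case b.room <- struct{}{}: // non-blocking, like sendResponse's signal
	default:
	}
}

func main() {
	b := newBounded(1)
	b.acquire()
	go b.release()
	b.acquire() // blocks until release signals that room may be available
}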
+
+// callFutureLocked makes a request to the server and returns a future response.
+func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureResponse, error) {
+ // Check connected again holding conn.mu.
+ conn.mu.Lock()
+ if !conn.connected {
+ conn.mu.Unlock()
+ // We checked connected before, so this must be due to an aborted
+ // connection.
+ return nil, syserror.ECONNABORTED
+ }
+ conn.mu.Unlock()
+
+ conn.fd.queue.PushBack(r)
+ conn.fd.numActiveRequests++
+ fut := newFutureResponse(r)
+ conn.fd.completions[r.id] = fut
+
+ // Signal the readers that there is something to read.
+ conn.fd.waitQueue.Notify(waiter.EventIn)
+
+ return fut, nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/connection_control.go b/pkg/sentry/fsimpl/fuse/connection_control.go
new file mode 100644
index 000000000..bfde78559
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/connection_control.go
@@ -0,0 +1,247 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "sync/atomic"
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// consts used by FUSE_INIT negotiation.
+const (
+ // fuseMaxMaxPages is the maximum value for MaxPages received in InitOut.
+ // Follows the same behavior as the Linux FUSE implementation.
+ fuseMaxMaxPages = 256
+
+ // Maximum value for the time granularity for file time stamps, 1s.
+ // Follows the same behavior as the Linux FUSE implementation.
+ fuseMaxTimeGranNs = 1000000000
+
+ // Minimum value for MaxWrite and MaxRead.
+ // Follows the same behavior as the Linux FUSE implementation.
+ fuseMinMaxWrite = 4096
+ fuseMinMaxRead = 4096
+
+ // Temporary default value for max readahead, 128kb.
+ fuseDefaultMaxReadahead = 131072
+
+ // The FUSE_INIT_IN flags sent to the daemon.
+ // TODO(gvisor.dev/issue/3199): complete the flags.
+ fuseDefaultInitFlags = linux.FUSE_MAX_PAGES
+)
+
+// Adjustable maximums for the connection's congestion control parameters.
+// Used as the upper bound of the config values.
+// Currently we do not support adjustment to them.
+var (
+ MaxUserBackgroundRequest uint16 = fuseDefaultMaxBackground
+ MaxUserCongestionThreshold uint16 = fuseDefaultCongestionThreshold
+)
+
+// SetInitialized atomically sets the connection as initialized.
+func (conn *connection) SetInitialized() {
+ // Unblock the requests sent before INIT.
+ close(conn.initializedChan)
+
+ // The channel is closed first to avoid the non-atomic situation
+ // where conn.initialized is true but tasks are still blocked
+ // on the channel, and to prevent newer tasks from gaining an
+ // unnecessarily higher chance of being issued before the blocked ones.
+
+ atomic.StoreInt32(&(conn.initialized), int32(1))
+}
+
+// Initialized atomically checks if the connection is initialized.
+// Pairs with SetInitialized().
+func (conn *connection) Initialized() bool {
+ return atomic.LoadInt32(&(conn.initialized)) != 0
+}
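The close-then-store ordering in SetInitialized, paired with the flag-then-channel check in Call, forms a small reusable gate. A self-contained sketch with illustrative names:

package main

import (
	"fmt"
	"sync/atomic"
)

type gate struct {
	ready int32
	ch    chan struct{}
}

func newGate() *gate { return &gate{ch: make(chan struct{})} }

// open unblocks waiters first, then publishes the fast-path flag, matching
// SetInitialized's ordering.
func (g *gate) open() {
	close(g.ch)
	atomic.StoreInt32(&g.ready, 1)
}

// wait takes the fast path if the flag is set, else blocks on the channel.
func (g *gate) wait() {
	if atomic.LoadInt32(&g.ready) != 0 {
		return
	}
	<-g.ch
}

func main() {
	g := newGate()
	done := make(chan struct{})
	go func() { g.wait(); close(done) }()
	g.open()
	<-done
	fmt.Println("initialized")
}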
+
+// InitSend sends a FUSE_INIT request.
+func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error {
+ in := linux.FUSEInitIn{
+ Major: linux.FUSE_KERNEL_VERSION,
+ Minor: linux.FUSE_KERNEL_MINOR_VERSION,
+ // TODO(gvisor.dev/issue/3196): find appropriate way to calculate this
+ MaxReadahead: fuseDefaultMaxReadahead,
+ Flags: fuseDefaultInitFlags,
+ }
+
+ req, err := conn.NewRequest(creds, pid, 0, linux.FUSE_INIT, &in)
+ if err != nil {
+ return err
+ }
+
+ // Since there is no task to block on and FUSE_INIT is the request
+ // to unblock other requests, use nil.
+ return conn.CallAsync(nil, req)
+}
+
+// InitRecv receives a FUSE_INIT reply and process it.
+//
+// Preconditions: conn.asyncMu must not be held if minor version is newer than 13.
+func (conn *connection) InitRecv(res *Response, hasSysAdminCap bool) error {
+ if err := res.Error(); err != nil {
+ return err
+ }
+
+ initRes := fuseInitRes{initLen: res.DataLen()}
+ if err := res.UnmarshalPayload(&initRes); err != nil {
+ return err
+ }
+
+ return conn.initProcessReply(&initRes.initOut, hasSysAdminCap)
+}
+
+// initProcessReply processes the FUSE_INIT reply from the FUSE server.
+// It tries to acquire the conn.asyncMu lock if minor version is newer than 13.
+func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap bool) error {
+ // Error or not, always set initialized
+ // to unblock the blocked requests.
+ defer conn.SetInitialized()
+
+ // No support for old major fuse versions.
+ if out.Major != linux.FUSE_KERNEL_VERSION {
+ conn.connInitError = true
+ return nil
+ }
+
+ // Start processing the reply.
+ conn.connInitSuccess = true
+ conn.minor = out.Minor
+
+ // No support for negotiating MaxWrite before minor version 5.
+ if out.Minor >= 5 {
+ conn.maxWrite = out.MaxWrite
+ } else {
+ conn.maxWrite = fuseMinMaxWrite
+ }
+ if conn.maxWrite < fuseMinMaxWrite {
+ conn.maxWrite = fuseMinMaxWrite
+ }
+
+ // No support for the following flags before minor version 6.
+ if out.Minor >= 6 {
+ conn.asyncRead = out.Flags&linux.FUSE_ASYNC_READ != 0
+ conn.bigWrites = out.Flags&linux.FUSE_BIG_WRITES != 0
+ conn.dontMask = out.Flags&linux.FUSE_DONT_MASK != 0
+ conn.writebackCache = out.Flags&linux.FUSE_WRITEBACK_CACHE != 0
+
+ // TODO(gvisor.dev/issue/3195): figure out how to use TimeGran (0 < TimeGran <= fuseMaxTimeGranNs).
+
+ if out.Flags&linux.FUSE_MAX_PAGES != 0 {
+ maxPages := out.MaxPages
+ if maxPages < 1 {
+ maxPages = 1
+ }
+ if maxPages > fuseMaxMaxPages {
+ maxPages = fuseMaxMaxPages
+ }
+ conn.maxPages = maxPages
+ }
+ }
+
+ // No support for limits before minor version 13.
+ if out.Minor >= 13 {
+ conn.asyncMu.Lock()
+
+ if out.MaxBackground > 0 {
+ conn.asyncNumMax = out.MaxBackground
+
+ if !hasSysAdminCap &&
+ conn.asyncNumMax > MaxUserBackgroundRequest {
+ conn.asyncNumMax = MaxUserBackgroundRequest
+ }
+ }
+
+ if out.CongestionThreshold > 0 {
+ conn.asyncCongestionThreshold = out.CongestionThreshold
+
+ if !hasSysAdminCap &&
+ conn.asyncCongestionThreshold > MaxUserCongestionThreshold {
+ conn.asyncCongestionThreshold = MaxUserCongestionThreshold
+ }
+ }
+
+ conn.asyncMu.Unlock()
+ }
+
+ return nil
+}
+
+// Abort this FUSE connection.
+// It tries to acquire conn.fd.mu, conn.mu, and conn.asyncMu in order.
+// All requests waiting or blocking will be aborted.
+//
+// Preconditions: conn.fd.mu is locked.
+func (conn *connection) Abort(ctx context.Context) {
+ conn.mu.Lock()
+ conn.asyncMu.Lock()
+
+ if !conn.connected {
+ conn.asyncMu.Unlock()
+ conn.mu.Unlock()
+ conn.fd.mu.Unlock()
+ return
+ }
+
+ conn.connected = false
+
+ // Empty the `fd.queue` that holds the requests
+ // not yet read by the FUSE daemon.
+ // These are a subset of the requests in the `fd.completions` map.
+ for !conn.fd.queue.Empty() {
+ req := conn.fd.queue.Front()
+ conn.fd.queue.Remove(req)
+ }
+
+ var terminate []linux.FUSEOpID
+
+ // Collect the requests that have not been sent to the FUSE daemon,
+ // or have not received a reply.
+ for unique := range conn.fd.completions {
+ terminate = append(terminate, unique)
+ }
+
+ // Release locks to avoid deadlock.
+ conn.asyncMu.Unlock()
+ conn.mu.Unlock()
+
+ // 1. The requests blocked before initialization.
+ // Will reach call() `connected` check and return.
+ if !conn.Initialized() {
+ conn.SetInitialized()
+ }
+
+ // 2. Terminate the requests collected above.
+ // Set ECONNABORTED error.
+ // sendError() will remove them from the `fd.completions` map.
+ // Will enter the path of a normally received error.
+ for _, toTerminate := range terminate {
+ conn.fd.sendError(ctx, -int32(syscall.ECONNABORTED), toTerminate)
+ }
+
+ // 3. The requests not yet written to FUSE device.
+ // Early terminate.
+ // Will reach callFutureLocked() `connected` check and return.
+ close(conn.fd.fullQueueCh)
+
+ // TODO(gvisor.dev/issue/3528): Forget all pending forget reqs.
+}
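Abort snapshots the outstanding work under its locks and only then performs the terminations lock-free, since sendError re-enters state guarded by fd.mu. A standalone sketch of that snapshot-then-act shape (names illustrative):

package main

import (
	"fmt"
	"sync"
)

type table struct {
	mu      sync.Mutex
	pending map[uint64]string
}

// abortAll copies the pending IDs under the lock, releases it, and then
// invokes fail for each; fail may itself need other locks, so it must not
// run with mu held.
func (t *table) abortAll(fail func(id uint64)) {
	t.mu.Lock()
	ids := make([]uint64, 0, len(t.pending))
	for id := range t.pending {
		ids = append(ids, id)
	}
	t.mu.Unlock()

	for _, id := range ids {
		fail(id)
	}
}

func main() {
	t := &table{pending: map[uint64]string{1: "req1", 2: "req2"}}
	t.abortAll(func(id uint64) { fmt.Println("aborted", id) })
}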
diff --git a/pkg/sentry/fsimpl/fuse/connection_test.go b/pkg/sentry/fsimpl/fuse/connection_test.go
new file mode 100644
index 000000000..91d16c1cf
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/connection_test.go
@@ -0,0 +1,117 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "math/rand"
+ "syscall"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// TestConnectionInitBlock tests if initialization
+// correctly blocks and unblocks the connection.
+// Since it's infeasible to test kernel.Task.Block() in a unit test,
+// the code in Call() is not tested here.
+func TestConnectionInitBlock(t *testing.T) {
+ s := setup(t)
+ defer s.Destroy()
+
+ k := kernel.KernelFromContext(s.Ctx)
+
+ conn, _, err := newTestConnection(s, k, maxActiveRequestsDefault)
+ if err != nil {
+ t.Fatalf("newTestConnection: %v", err)
+ }
+
+ select {
+ case <-conn.initializedChan:
+ t.Fatalf("initializedChan should be blocking before SetInitialized")
+ default:
+ }
+
+ conn.SetInitialized()
+
+ select {
+ case <-conn.initializedChan:
+ default:
+ t.Fatalf("initializedChan should not be blocking after SetInitialized")
+ }
+}
+
+func TestConnectionAbort(t *testing.T) {
+ s := setup(t)
+ defer s.Destroy()
+
+ k := kernel.KernelFromContext(s.Ctx)
+ creds := auth.CredentialsFromContext(s.Ctx)
+ task := kernel.TaskFromContext(s.Ctx)
+
+ const numRequests uint64 = 256
+
+ conn, _, err := newTestConnection(s, k, numRequests)
+ if err != nil {
+ t.Fatalf("newTestConnection: %v", err)
+ }
+
+ testObj := &testPayload{
+ data: rand.Uint32(),
+ }
+
+ var futNormal []*futureResponse
+
+ for i := 0; i < int(numRequests); i++ {
+ req, err := conn.NewRequest(creds, uint32(i), uint64(i), 0, testObj)
+ if err != nil {
+ t.Fatalf("NewRequest creation failed: %v", err)
+ }
+ fut, err := conn.callFutureLocked(task, req)
+ if err != nil {
+ t.Fatalf("callFutureLocked failed: %v", err)
+ }
+ futNormal = append(futNormal, fut)
+ }
+
+ conn.Abort(s.Ctx)
+
+ // Abort should unblock the initialization channel.
+ // Note: no test requests are actually blocked on `conn.initializedChan`.
+ select {
+ case <-conn.initializedChan:
+ default:
+ t.Fatalf("initializedChan should not be blocking after SetInitialized")
+ }
+
+ // Abort will return ECONNABORTED error to unblocked requests.
+ for _, fut := range futNormal {
+ if fut.getResponse().hdr.Error != -int32(syscall.ECONNABORTED) {
+ t.Fatalf("Incorrect error code received for aborted connection: %v", fut.getResponse().hdr.Error)
+ }
+ }
+
+ // After abort, Call() should return directly with ENOTCONN.
+ req, err := conn.NewRequest(creds, 0, 0, 0, testObj)
+ if err != nil {
+ t.Fatalf("NewRequest creation failed: %v", err)
+ }
+ _, err = conn.Call(task, req)
+ if err != syserror.ENOTCONN {
+ t.Fatalf("Incorrect error code received for Call() after connection aborted")
+ }
+}
diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go
new file mode 100644
index 000000000..1b86a4b4c
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/dev.go
@@ -0,0 +1,463 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+const fuseDevMinor = 229
+
+// fuseDevice implements vfs.Device for /dev/fuse.
+//
+// +stateify savable
+type fuseDevice struct{}
+
+// Open implements vfs.Device.Open.
+func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ if !kernel.FUSEEnabled {
+ return nil, syserror.ENOENT
+ }
+
+ var fd DeviceFD
+ if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{
+ UseDentryMetadata: true,
+ }); err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
+
+// DeviceFD implements vfs.FileDescriptionImpl for /dev/fuse.
+//
+// +stateify savable
+type DeviceFD struct {
+ vfsfd vfs.FileDescription
+ vfs.FileDescriptionDefaultImpl
+ vfs.DentryMetadataFileDescriptionImpl
+ vfs.NoLockFD
+
+ // nextOpID is used to create new requests.
+ nextOpID linux.FUSEOpID
+
+ // queue is the list of requests that need to be processed by the FUSE server.
+ queue requestList
+
+ // numActiveRequests is the number of requests made by the Sentry that
+ // have yet to be responded to.
+ numActiveRequests uint64
+
+ // completions is used to map a request to its response. A Writer will use this
+ // to notify the caller of a completed response.
+ completions map[linux.FUSEOpID]*futureResponse
+
+ // writeCursor tracks how many bytes of the in-progress response
+ // (header, then payload) have been copied in from the server so far.
+ writeCursor uint32
+
+ // writeBuf is the memory buffer used to copy in the FUSE out header from
+ // userspace.
+ writeBuf []byte
+
+ // writeCursorFR current FR being copied from server.
+ writeCursorFR *futureResponse
+
+ // mu protects all the queues, maps, buffers and cursors and nextOpID.
+ mu sync.Mutex `state:"nosave"`
+
+ // waitQueue is used to notify interested parties when the device becomes
+ // readable or writable.
+ waitQueue waiter.Queue
+
+ // fullQueueCh is a channel used to synchronize the readers with the writers.
+ // Writers (inbound requests to the filesystem) block if there are too many
+ // unprocessed in-flight requests.
+ fullQueueCh chan struct{} `state:".(int)"`
+
+ // fs is the FUSE filesystem that this FD is being used for.
+ fs *filesystem
+}
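The completions map is the rendezvous between a caller's futureResponse and the server's eventual write. A toy sketch of that handshake, with illustrative names:

package main

import "fmt"

type future struct {
	ch  chan struct{}
	val string
}

func main() {
	completions := make(map[uint64]*future)

	// Caller side: register a future under the request's unique ID.
	f := &future{ch: make(chan struct{})}
	completions[1] = f

	// Server side: look the future up by ID, fill it, and signal.
	if fut, ok := completions[1]; ok {
		delete(completions, 1)
		fut.val = "response for op 1"
		close(fut.ch)
	}

	<-f.ch // the caller blocks here until the response arrives
	fmt.Println(f.val)
}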
+
+func (fd *DeviceFD) saveFullQueueCh() int {
+ return cap(fd.fullQueueCh)
+}
+
+func (fd *DeviceFD) loadFullQueueCh(capacity int) {
+ fd.fullQueueCh = make(chan struct{}, capacity)
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *DeviceFD) Release(ctx context.Context) {
+ if fd.fs != nil {
+ fd.fs.conn.mu.Lock()
+ fd.fs.conn.connected = false
+ fd.fs.conn.mu.Unlock()
+
+ fd.fs.VFSFilesystem().DecRef(ctx)
+ fd.fs = nil
+ }
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
+ if fd.fs == nil {
+ return 0, syserror.EPERM
+ }
+
+ return 0, syserror.ENOSYS
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
+ if fd.fs == nil {
+ return 0, syserror.EPERM
+ }
+
+ // Return ENODEV if the filesystem is umounted.
+ if fd.fs.umounted {
+ // TODO(gvisor.dev/issue/3525): return ECONNABORTED if aborted via fuse control fs.
+ return 0, syserror.ENODEV
+ }
+
+ // We require that any Read done on this filesystem have a sane minimum
+ // read buffer. It must have the capacity for the fixed parts of any request
+ // header (Linux uses the request header and the FUSEWriteIn header for this
+ // calculation) + the negotiated MaxWrite room for the data.
+ minBuffSize := linux.FUSE_MIN_READ_BUFFER
+ inHdrLen := uint32((*linux.FUSEHeaderIn)(nil).SizeBytes())
+ writeHdrLen := uint32((*linux.FUSEWriteIn)(nil).SizeBytes())
+ negotiatedMinBuffSize := inHdrLen + writeHdrLen + fd.fs.conn.maxWrite
+ if minBuffSize < negotiatedMinBuffSize {
+ minBuffSize = negotiatedMinBuffSize
+ }
+
+ // If the read buffer is too small, error out.
+ if dst.NumBytes() < int64(minBuffSize) {
+ return 0, syserror.EINVAL
+ }
+
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ return fd.readLocked(ctx, dst, opts)
+}
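A worked instance of the minimum-buffer computation above, under assumed sizes: Linux defines FUSE_MIN_READ_BUFFER as 8192, and the in/write headers are 40 bytes each, so a negotiated maxWrite of 128KiB dominates:

package main

import "fmt"

func main() {
	const (
		fuseMinReadBuffer = 8192   // FUSE_MIN_READ_BUFFER (assumed)
		inHdrLen          = 40     // size of the FUSE in-header (assumed)
		writeHdrLen       = 40     // size of the FUSE write-in header (assumed)
		maxWrite          = 131072 // negotiated in FUSE_INIT
	)
	minBuf := fuseMinReadBuffer
	if negotiated := inHdrLen + writeHdrLen + maxWrite; minBuf < negotiated {
		minBuf = negotiated
	}
	fmt.Println(minBuf) // 131152: reads smaller than this return EINVAL
}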
+
+// readLocked implements the reading of the fuse device while locked with DeviceFD.mu.
+//
+// Preconditions: dst is large enough for any reasonable request.
+func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ var req *Request
+
+ // Find the first valid request.
+ // In the normal case this loop only executes once.
+ for !fd.queue.Empty() {
+ req = fd.queue.Front()
+
+ if int64(req.hdr.Len)+int64(len(req.payload)) <= dst.NumBytes() {
+ break
+ }
+
+ // The request is too large. Cannot process it. All requests must be smaller than the
+ // negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT
+ // handshake.
+ errno := -int32(syscall.EIO)
+ if req.hdr.Opcode == linux.FUSE_SETXATTR {
+ errno = -int32(syscall.E2BIG)
+ }
+
+ // Return the error to the calling task.
+ if err := fd.sendError(ctx, errno, req.hdr.Unique); err != nil {
+ return 0, err
+ }
+
+ // We're done with this request.
+ fd.queue.Remove(req)
+ req = nil
+ }
+
+ if req == nil {
+ return 0, syserror.ErrWouldBlock
+ }
+
+ // We already checked the size: dst must be able to fit the whole request.
+ // Now we write the marshalled header, the payload,
+ // and the potential additional payload
+ // to the user memory IOSequence.
+
+ n, err := dst.CopyOut(ctx, req.data)
+ if err != nil {
+ return 0, err
+ }
+ if n != len(req.data) {
+ return 0, syserror.EIO
+ }
+
+ if req.hdr.Opcode == linux.FUSE_WRITE {
+ written, err := dst.DropFirst(n).CopyOut(ctx, req.payload)
+ if err != nil {
+ return 0, err
+ }
+ if written != len(req.payload) {
+ return 0, syserror.EIO
+ }
+ n += int(written)
+ }
+
+ // Fully done with this req, remove it from the queue.
+ fd.queue.Remove(req)
+
+ // Remove noReply ones from map of requests expecting a reply.
+ if req.noReply {
+ fd.numActiveRequests--
+ delete(fd.completions, req.hdr.Unique)
+ }
+
+ return int64(n), nil
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
+ if fd.fs == nil {
+ return 0, syserror.EPERM
+ }
+
+ return 0, syserror.ENOSYS
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ return fd.writeLocked(ctx, src, opts)
+}
+
+// writeLocked implements writing to the fuse device while locked with DeviceFD.mu.
+func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
+ if fd.fs == nil {
+ return 0, syserror.EPERM
+ }
+
+ // Return ENODEV if the filesystem is umounted.
+ if fd.fs.umounted {
+ return 0, syserror.ENODEV
+ }
+
+ var cn, n int64
+ hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
+
+ for src.NumBytes() > 0 {
+ if fd.writeCursorFR != nil {
+ // Already have common header, and we're now copying the payload.
+ wantBytes := fd.writeCursorFR.hdr.Len
+
+ // Note that the FR data doesn't have the header. Copy it over if it's necessary.
+ if fd.writeCursorFR.data == nil {
+ fd.writeCursorFR.data = make([]byte, wantBytes)
+ }
+
+ bytesCopied, err := src.CopyIn(ctx, fd.writeCursorFR.data[fd.writeCursor:wantBytes])
+ if err != nil {
+ return 0, err
+ }
+ src = src.DropFirst(bytesCopied)
+
+ cn = int64(bytesCopied)
+ n += cn
+ fd.writeCursor += uint32(cn)
+ if fd.writeCursor == wantBytes {
+ // Done reading this full response. Clean up and unblock the
+ // initiator.
+ break
+ }
+
+ // Check if we have more data in src.
+ continue
+ }
+
+ // Assert that the header isn't read into the writeBuf yet.
+ if fd.writeCursor >= hdrLen {
+ return 0, syserror.EINVAL
+ }
+
+ // We don't have the full common response header yet.
+ wantBytes := hdrLen - fd.writeCursor
+ bytesCopied, err := src.CopyIn(ctx, fd.writeBuf[fd.writeCursor:wantBytes])
+ if err != nil {
+ return 0, err
+ }
+ src = src.DropFirst(bytesCopied)
+
+ cn = int64(bytesCopied)
+ n += cn
+ fd.writeCursor += uint32(cn)
+ if fd.writeCursor == hdrLen {
+ // Have full header in the writeBuf. Use it to fetch the actual futureResponse
+ // from the device's completions map.
+ var hdr linux.FUSEHeaderOut
+ hdr.UnmarshalBytes(fd.writeBuf)
+
+ // We have the header now and so the writeBuf has served its purpose.
+ // We could reset it manually here, but instead, at the end of the
+ // write, the writeCursor will be set to 0, allowing the next request
+ // to overwrite what's in the buffer.
+
+ fut, ok := fd.completions[hdr.Unique]
+ if !ok {
+ // Server sent us a response for a request we never sent,
+ // or for which we already received a reply (e.g. aborted), an unlikely event.
+ return 0, syserror.EINVAL
+ }
+
+ delete(fd.completions, hdr.Unique)
+
+ // Copy over the header into the future response. The rest of the payload
+ // will be copied over to the FR's data in the next iteration.
+ fut.hdr = &hdr
+ fd.writeCursorFR = fut
+
+ // The next iteration will try to read the complete request, if src has
+ // any data remaining. Otherwise we're done.
+ }
+ }
+
+ if fd.writeCursorFR != nil {
+ if err := fd.sendResponse(ctx, fd.writeCursorFR); err != nil {
+ return 0, err
+ }
+
+ // Ready the device for the next request.
+ fd.writeCursorFR = nil
+ fd.writeCursor = 0
+ }
+
+ return n, nil
+}
+
+// Readiness implements vfs.FileDescriptionImpl.Readiness.
+func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ return fd.readinessLocked(mask)
+}
+
+// readinessLocked implements checking the readiness of the fuse device while
+// locked with DeviceFD.mu.
+func (fd *DeviceFD) readinessLocked(mask waiter.EventMask) waiter.EventMask {
+ var ready waiter.EventMask
+
+ if fd.fs.umounted {
+ ready |= waiter.EventErr
+ return ready & mask
+ }
+
+ // FD is always writable.
+ ready |= waiter.EventOut
+ if !fd.queue.Empty() {
+ // Have reqs available, FD is readable.
+ ready |= waiter.EventIn
+ }
+
+ return ready & mask
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (fd *DeviceFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ fd.waitQueue.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (fd *DeviceFD) EventUnregister(e *waiter.Entry) {
+ fd.waitQueue.EventUnregister(e)
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
+ if fd.fs == nil {
+ return 0, syserror.EPERM
+ }
+
+ return 0, syserror.ENOSYS
+}
+
+// sendResponse sends a response to the waiting task (if any).
+//
+// Preconditions: fd.mu must be held.
+func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error {
+ // Signal the task waiting on a response if any.
+ defer close(fut.ch)
+
+ // Signal that the queue is no longer full.
+ select {
+ case fd.fullQueueCh <- struct{}{}:
+ default:
+ }
+ fd.numActiveRequests--
+
+ if fut.async {
+ return fd.asyncCallBack(ctx, fut.getResponse())
+ }
+
+ return nil
+}
+
+// sendError sends an error response to the waiting task (if any) by calling sendResponse().
+//
+// Preconditions: fd.mu must be held.
+func (fd *DeviceFD) sendError(ctx context.Context, errno int32, unique linux.FUSEOpID) error {
+ // Return the error to the calling task.
+ outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
+ respHdr := linux.FUSEHeaderOut{
+ Len: outHdrLen,
+ Error: errno,
+ Unique: unique,
+ }
+
+ fut, ok := fd.completions[respHdr.Unique]
+ if !ok {
+ // A response for a request we never sent,
+ // or for which we already received a reply (e.g. aborted).
+ return syserror.EINVAL
+ }
+ delete(fd.completions, respHdr.Unique)
+
+ fut.hdr = &respHdr
+ return fd.sendResponse(ctx, fut)
+}
+
+// asyncCallBack executes the pre-defined callback function for async requests.
+// Currently used by: FUSE_INIT.
+func (fd *DeviceFD) asyncCallBack(ctx context.Context, r *Response) error {
+ switch r.opcode {
+ case linux.FUSE_INIT:
+ creds := auth.CredentialsFromContext(ctx)
+ rootUserNs := kernel.KernelFromContext(ctx).RootUserNamespace()
+ return fd.fs.conn.InitRecv(r, creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, rootUserNs))
+ // TODO(gvisor.dev/issue/3247): support async read: correctly process the response.
+ }
+
+ return nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go
new file mode 100644
index 000000000..5986133e9
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/dev_test.go
@@ -0,0 +1,323 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "fmt"
+ "math/rand"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// echoTestOpcode is the Opcode used during testing. The server used in tests
+// will simply echo the payload back with the appropriate headers.
+const echoTestOpcode linux.FUSEOpcode = 1000
+
+// TestFUSECommunication tests that the communication layer between the Sentry and the
+// FUSE server daemon works as expected.
+func TestFUSECommunication(t *testing.T) {
+ s := setup(t)
+ defer s.Destroy()
+
+ k := kernel.KernelFromContext(s.Ctx)
+ creds := auth.CredentialsFromContext(s.Ctx)
+
+ // Create test cases with different number of concurrent clients and servers.
+ testCases := []struct {
+ Name string
+ NumClients int
+ NumServers int
+ MaxActiveRequests uint64
+ }{
+ {
+ Name: "SingleClientSingleServer",
+ NumClients: 1,
+ NumServers: 1,
+ MaxActiveRequests: maxActiveRequestsDefault,
+ },
+ {
+ Name: "SingleClientMultipleServers",
+ NumClients: 1,
+ NumServers: 10,
+ MaxActiveRequests: maxActiveRequestsDefault,
+ },
+ {
+ Name: "MultipleClientsSingleServer",
+ NumClients: 10,
+ NumServers: 1,
+ MaxActiveRequests: maxActiveRequestsDefault,
+ },
+ {
+ Name: "MultipleClientsMultipleServers",
+ NumClients: 10,
+ NumServers: 10,
+ MaxActiveRequests: maxActiveRequestsDefault,
+ },
+ {
+ Name: "RequestCapacityFull",
+ NumClients: 10,
+ NumServers: 1,
+ MaxActiveRequests: 1,
+ },
+ {
+ Name: "RequestCapacityContinuouslyFull",
+ NumClients: 100,
+ NumServers: 2,
+ MaxActiveRequests: 2,
+ },
+ }
+
+ for _, testCase := range testCases {
+ t.Run(testCase.Name, func(t *testing.T) {
+ conn, fd, err := newTestConnection(s, k, testCase.MaxActiveRequests)
+ if err != nil {
+ t.Fatalf("newTestConnection: %v", err)
+ }
+
+ clientsDone := make([]chan struct{}, testCase.NumClients)
+ serversDone := make([]chan struct{}, testCase.NumServers)
+ serversKill := make([]chan struct{}, testCase.NumServers)
+
+ // FUSE clients.
+ for i := 0; i < testCase.NumClients; i++ {
+ clientsDone[i] = make(chan struct{})
+ go func(i int) {
+ fuseClientRun(t, s, k, conn, creds, uint32(i), uint64(i), clientsDone[i])
+ }(i)
+ }
+
+ // FUSE servers.
+ for j := 0; j < testCase.NumServers; j++ {
+ serversDone[j] = make(chan struct{})
+ serversKill[j] = make(chan struct{}, 1) // The kill command shouldn't block.
+ go func(j int) {
+ fuseServerRun(t, s, k, fd, serversDone[j], serversKill[j])
+ }(j)
+ }
+
+ // Tear down.
+ //
+ // Make sure all the clients are done.
+ for i := 0; i < testCase.NumClients; i++ {
+ <-clientsDone[i]
+ }
+
+ // Kill any server that is potentially waiting.
+ for j := 0; j < testCase.NumServers; j++ {
+ serversKill[j] <- struct{}{}
+ }
+
+ // Make sure all the servers are done.
+ for j := 0; j < testCase.NumServers; j++ {
+ <-serversDone[j]
+ }
+ })
+ }
+}
+
+// CallTest makes a request to the server and blocks the invoking
+// goroutine until the server responds. It doesn't block a kernel.Task.
+// Analogous to Connection.Call but used for testing.
+func CallTest(conn *connection, t *kernel.Task, r *Request, i uint32) (*Response, error) {
+ conn.fd.mu.Lock()
+
+ // Wait until we're certain that a new request can be processed.
+ for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests {
+ conn.fd.mu.Unlock()
+		<-conn.fd.fullQueueCh
+ conn.fd.mu.Lock()
+ }
+
+	fut, err := conn.callFutureLocked(t, r)
+ conn.fd.mu.Unlock()
+
+ if err != nil {
+ return nil, err
+ }
+
+ // Resolve the response.
+ //
+ // Block without a task.
+	<-fut.ch
+
+ // A response is ready. Resolve and return it.
+ return fut.getResponse(), nil
+}
+
+// ReadTest is analogous to vfs.FileDescription.Read and reads from the FUSE
+// device. However, instead of blocking the calling task, it just waits on a
+// channel. The behavior is essentially the same as DeviceFD.Read except it
+// guarantees that the task is never blocked.
+func ReadTest(serverTask *kernel.Task, fd *vfs.FileDescription, inIOseq usermem.IOSequence, killServer chan struct{}) (int64, bool, error) {
+ var err error
+ var n, total int64
+
+ dev := fd.Impl().(*DeviceFD)
+
+ // Register for notifications.
+ w, ch := waiter.NewChannelEntry(nil)
+ dev.EventRegister(&w, waiter.EventIn)
+ for {
+ // Issue the request and break out if it completes with anything other than
+ // "would block".
+ n, err = dev.Read(serverTask, inIOseq, vfs.ReadOptions{})
+ total += n
+ if err != syserror.ErrWouldBlock {
+ break
+ }
+
+ // Wait for a notification that we should retry.
+ // Emulate the blocking for when no requests are available
+ select {
+ case <-ch:
+ case <-killServer:
+ // Server killed by the main program.
+ return 0, true, nil
+ }
+ }
+
+ dev.EventUnregister(&w)
+ return total, false, err
+}
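The loop above is an instance of the general event-wait pattern used throughout the sentry: register a channel-backed waiter entry, retry the non-blocking operation until it stops returning ErrWouldBlock, and unregister on the way out. A minimal sketch of that pattern, where waitReady, w, and op are illustrative assumptions rather than part of this change:

	// waitReady retries op until it completes with something other than
	// ErrWouldBlock, sleeping on a channel between attempts. w and op are
	// assumed inputs for illustration.
	func waitReady(w waiter.Waitable, op func() error) error {
		e, ch := waiter.NewChannelEntry(nil)
		w.EventRegister(&e, waiter.EventIn)
		defer w.EventUnregister(&e)
		for {
			if err := op(); err != syserror.ErrWouldBlock {
				return err
			}
			<-ch // Wait until w becomes readable, then retry.
		}
	}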
+
+// fuseClientRun emulates all the actions of a normal FUSE request. It creates
+// a header, a payload, calls the server, waits for the response, and processes
+// the response.
+func fuseClientRun(t *testing.T, s *testutil.System, k *kernel.Kernel, conn *connection, creds *auth.Credentials, pid uint32, inode uint64, clientDone chan struct{}) {
+ defer func() { clientDone <- struct{}{} }()
+
+ tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+ clientTask, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("fuse-client-%v", pid), tc, s.MntNs, s.Root, s.Root)
+ if err != nil {
+ t.Fatal(err)
+ }
+ testObj := &testPayload{
+ data: rand.Uint32(),
+ }
+
+ req, err := conn.NewRequest(creds, pid, inode, echoTestOpcode, testObj)
+ if err != nil {
+ t.Fatalf("NewRequest creation failed: %v", err)
+ }
+
+ // Queue up a request.
+ // Analogous to Call except it doesn't block on the task.
+ resp, err := CallTest(conn, clientTask, req, pid)
+ if err != nil {
+ t.Fatalf("CallTaskNonBlock failed: %v", err)
+ }
+
+ if err = resp.Error(); err != nil {
+ t.Fatalf("Server responded with an error: %v", err)
+ }
+
+ var respTestPayload testPayload
+ if err := resp.UnmarshalPayload(&respTestPayload); err != nil {
+ t.Fatalf("Unmarshalling payload error: %v", err)
+ }
+
+ if resp.hdr.Unique != req.hdr.Unique {
+ t.Fatalf("got response for another request. Expected response for req %v but got response for req %v",
+ req.hdr.Unique, resp.hdr.Unique)
+ }
+
+ if respTestPayload.data != testObj.data {
+ t.Fatalf("read incorrect data. Data expected: %v, but got %v", testObj.data, respTestPayload.data)
+ }
+}
+
+// fuseServerRun creates a task and emulates all the actions of a simple FUSE
+// server that reads a request and echoes the same struct back as a response
+// using the appropriate headers.
+func fuseServerRun(t *testing.T, s *testutil.System, k *kernel.Kernel, fd *vfs.FileDescription, serverDone, killServer chan struct{}) {
+ defer func() { serverDone <- struct{}{} }()
+
+ // Create the tasks that the server will be using.
+ tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+ var readPayload testPayload
+
+ serverTask, err := testutil.CreateTask(s.Ctx, "fuse-server", tc, s.MntNs, s.Root, s.Root)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ // Read the request.
+ for {
+ inHdrLen := uint32((*linux.FUSEHeaderIn)(nil).SizeBytes())
+ payloadLen := uint32(readPayload.SizeBytes())
+
+		// The read buffer must meet a minimum size requirement.
+ buffSize := inHdrLen + payloadLen
+ if buffSize < linux.FUSE_MIN_READ_BUFFER {
+ buffSize = linux.FUSE_MIN_READ_BUFFER
+ }
+ inBuf := make([]byte, buffSize)
+ inIOseq := usermem.BytesIOSequence(inBuf)
+
+ n, serverKilled, err := ReadTest(serverTask, fd, inIOseq, killServer)
+ if err != nil {
+ t.Fatalf("Read failed :%v", err)
+ }
+
+ // Server should shut down. No new requests are going to be made.
+ if serverKilled {
+ break
+ }
+
+ if n <= 0 {
+ t.Fatalf("Read read no bytes")
+ }
+
+ var readFUSEHeaderIn linux.FUSEHeaderIn
+ readFUSEHeaderIn.UnmarshalUnsafe(inBuf[:inHdrLen])
+ readPayload.UnmarshalUnsafe(inBuf[inHdrLen : inHdrLen+payloadLen])
+
+ if readFUSEHeaderIn.Opcode != echoTestOpcode {
+ t.Fatalf("read incorrect data. Header: %v, Payload: %v", readFUSEHeaderIn, readPayload)
+ }
+
+ // Write the response.
+ outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
+ outBuf := make([]byte, outHdrLen+payloadLen)
+ outHeader := linux.FUSEHeaderOut{
+ Len: outHdrLen + payloadLen,
+ Error: 0,
+ Unique: readFUSEHeaderIn.Unique,
+ }
+
+ // Echo the payload back.
+ outHeader.MarshalUnsafe(outBuf[:outHdrLen])
+ readPayload.MarshalUnsafe(outBuf[outHdrLen:])
+ outIOseq := usermem.BytesIOSequence(outBuf)
+
+ n, err = fd.Write(s.Ctx, outIOseq, vfs.WriteOptions{})
+ if err != nil {
+ t.Fatalf("Write failed :%v", err)
+ }
+ }
+}
diff --git a/pkg/sentry/fsimpl/fuse/directory.go b/pkg/sentry/fsimpl/fuse/directory.go
new file mode 100644
index 000000000..8f220a04b
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/directory.go
@@ -0,0 +1,105 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+type directoryFD struct {
+ fileDescription
+}
+
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (*directoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ return syserror.EISDIR
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (*directoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ return 0, syserror.EISDIR
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (*directoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ return 0, syserror.EISDIR
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (*directoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ return 0, syserror.EISDIR
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (*directoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ return 0, syserror.EISDIR
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (dir *directoryFD) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback) error {
+ fusefs := dir.inode().fs
+ task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
+
+ in := linux.FUSEReadIn{
+ Fh: dir.Fh,
+ Offset: uint64(atomic.LoadInt64(&dir.off)),
+ Size: linux.FUSE_PAGE_SIZE,
+ Flags: dir.statusFlags(),
+ }
+
+	// TODO(gvisor.dev/issue/3404): Support FUSE_READDIRPLUS.
+ req, err := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), dir.inode().nodeID, linux.FUSE_READDIR, &in)
+ if err != nil {
+ return err
+ }
+
+ res, err := fusefs.conn.Call(task, req)
+ if err != nil {
+ return err
+ }
+ if err := res.Error(); err != nil {
+ return err
+ }
+
+ var out linux.FUSEDirents
+ if err := res.UnmarshalPayload(&out); err != nil {
+ return err
+ }
+
+ for _, fuseDirent := range out.Dirents {
+ nextOff := int64(fuseDirent.Meta.Off)
+ dirent := vfs.Dirent{
+ Name: fuseDirent.Name,
+ Type: uint8(fuseDirent.Meta.Type),
+ Ino: fuseDirent.Meta.Ino,
+ NextOff: nextOff,
+ }
+
+ if err := callback.Handle(dirent); err != nil {
+ return err
+ }
+ atomic.StoreInt64(&dir.off, nextOff)
+ }
+
+ return nil
+}
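For illustration, the callback driving the loop above can be as small as the following sketch; direntCollector is hypothetical and not part of this change:

	// direntCollector implements vfs.IterDirentsCallback and records the
	// name of every entry that IterDirents hands to it.
	type direntCollector struct {
		names []string
	}

	// Handle implements vfs.IterDirentsCallback.Handle.
	func (c *direntCollector) Handle(dirent vfs.Dirent) error {
		c.names = append(c.names, dirent.Name)
		return nil
	}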
diff --git a/pkg/sentry/fsimpl/fuse/file.go b/pkg/sentry/fsimpl/fuse/file.go
new file mode 100644
index 000000000..83f2816b7
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/file.go
@@ -0,0 +1,133 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// fileDescription implements vfs.FileDescriptionImpl for fuse.
+type fileDescription struct {
+ vfsfd vfs.FileDescription
+ vfs.FileDescriptionDefaultImpl
+ vfs.DentryMetadataFileDescriptionImpl
+ vfs.NoLockFD
+
+	// Fh is the file handle used in userspace.
+	Fh uint64
+
+	// Nonseekable indicates that seeking is not supported on this file.
+	Nonseekable bool
+
+	// DirectIO suggests that FUSE use direct I/O operations.
+	DirectIO bool
+
+	// OpenFlag is the flag returned by the FUSE server's open reply.
+ OpenFlag uint32
+
+ // off is the file offset.
+ off int64
+}
+
+func (fd *fileDescription) dentry() *kernfs.Dentry {
+ return fd.vfsfd.Dentry().Impl().(*kernfs.Dentry)
+}
+
+func (fd *fileDescription) inode() *inode {
+ return fd.dentry().Inode().(*inode)
+}
+
+func (fd *fileDescription) filesystem() *vfs.Filesystem {
+ return fd.vfsfd.VirtualDentry().Mount().Filesystem()
+}
+
+func (fd *fileDescription) statusFlags() uint32 {
+ return fd.vfsfd.StatusFlags()
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *fileDescription) Release(ctx context.Context) {
+	// No need to release if the FUSE server doesn't implement Open.
+ conn := fd.inode().fs.conn
+ if conn.noOpen {
+ return
+ }
+
+ in := linux.FUSEReleaseIn{
+ Fh: fd.Fh,
+ Flags: fd.statusFlags(),
+ }
+ // TODO(gvisor.dev/issue/3245): add logic when we support file lock owner.
+ var opcode linux.FUSEOpcode
+ if fd.inode().Mode().IsDir() {
+ opcode = linux.FUSE_RELEASEDIR
+ } else {
+ opcode = linux.FUSE_RELEASE
+ }
+ kernelTask := kernel.TaskFromContext(ctx)
+	// Ignoring errors and the FUSE server's reply is analogous to Linux's behavior.
+ req, err := conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), fd.inode().nodeID, opcode, &in)
+ if err != nil {
+ // No way to invoke Call() with an errored request.
+ return
+ }
+ // The reply will be ignored since no callback is defined in asyncCallBack().
+ conn.CallAsync(kernelTask, req)
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ return 0, nil
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ return 0, nil
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ return 0, nil
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ return 0, nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ return 0, nil
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+ fs := fd.filesystem()
+ inode := fd.inode()
+ return inode.Stat(ctx, fs, opts)
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+ fs := fd.filesystem()
+ creds := auth.CredentialsFromContext(ctx)
+ return fd.inode().setAttr(ctx, fs, creds, opts, true, fd.Fh)
+}
diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go
new file mode 100644
index 000000000..e7ef5998e
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/fusefs.go
@@ -0,0 +1,822 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fuse implements fusefs.
+package fuse
+
+import (
+ "math"
+ "strconv"
+ "sync"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// Name is the default filesystem name.
+const Name = "fuse"
+
+// maxActiveRequestsDefault is the default setting controlling the upper bound
+// on the number of active requests at any given time.
+const maxActiveRequestsDefault = 10000
+
+// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
+type FilesystemType struct{}
+
+// +stateify savable
+type filesystemOptions struct {
+ // userID specifies the numeric uid of the mount owner.
+ // This option should not be specified by the filesystem owner.
+ // It is set by libfuse (or, if libfuse is not used, must be set
+	// by the filesystem itself). For more information, see the
+	// fuse(8) man page.
+ userID uint32
+
+ // groupID specifies the numeric gid of the mount owner.
+ // This option should not be specified by the filesystem owner.
+ // It is set by libfuse (or, if libfuse is not used, must be set
+	// by the filesystem itself). For more information, see the
+	// fuse(8) man page.
+ groupID uint32
+
+	// rootMode specifies the file mode of the filesystem's root.
+ rootMode linux.FileMode
+
+ // maxActiveRequests specifies the maximum number of active requests that can
+ // exist at any time. Any further requests will block when trying to
+ // Call the server.
+ maxActiveRequests uint64
+
+ // maxRead is the max number of bytes to read,
+ // specified as "max_read" in fs parameters.
+ // If not specified by user, use math.MaxUint32 as default value.
+ maxRead uint32
+}
+
+// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
+type filesystem struct {
+ kernfs.Filesystem
+ devMinor uint32
+
+ // conn is used for communication between the FUSE server
+ // daemon and the sentry fusefs.
+ conn *connection
+
+ // opts is the options the fusefs is initialized with.
+ opts *filesystemOptions
+
+ // umounted is true if filesystem.Release() has been called.
+ umounted bool
+}
+
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+ return Name
+}
+
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+ devMinor, err := vfsObj.GetAnonBlockDevMinor()
+ if err != nil {
+ return nil, nil, err
+ }
+
+ var fsopts filesystemOptions
+ mopts := vfs.GenericParseMountOptions(opts.Data)
+ deviceDescriptorStr, ok := mopts["fd"]
+ if !ok {
+ log.Warningf("%s.GetFilesystem: communication file descriptor N (obtained by opening /dev/fuse) must be specified as 'fd=N'", fsType.Name())
+ return nil, nil, syserror.EINVAL
+ }
+ delete(mopts, "fd")
+
+ deviceDescriptor, err := strconv.ParseInt(deviceDescriptorStr, 10 /* base */, 32 /* bitSize */)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ kernelTask := kernel.TaskFromContext(ctx)
+ if kernelTask == nil {
+ log.Warningf("%s.GetFilesystem: couldn't get kernel task from context", fsType.Name())
+ return nil, nil, syserror.EINVAL
+ }
+ fuseFd := kernelTask.GetFileVFS2(int32(deviceDescriptor))
+
+ // Parse and set all the other supported FUSE mount options.
+	// TODO(gvisor.dev/issue/3229): Expand the supported mount options.
+ if userIDStr, ok := mopts["user_id"]; ok {
+ delete(mopts, "user_id")
+ userID, err := strconv.ParseUint(userIDStr, 10, 32)
+ if err != nil {
+ log.Warningf("%s.GetFilesystem: invalid user_id: user_id=%s", fsType.Name(), userIDStr)
+ return nil, nil, syserror.EINVAL
+ }
+ fsopts.userID = uint32(userID)
+ }
+
+ if groupIDStr, ok := mopts["group_id"]; ok {
+ delete(mopts, "group_id")
+ groupID, err := strconv.ParseUint(groupIDStr, 10, 32)
+ if err != nil {
+ log.Warningf("%s.GetFilesystem: invalid group_id: group_id=%s", fsType.Name(), groupIDStr)
+ return nil, nil, syserror.EINVAL
+ }
+ fsopts.groupID = uint32(groupID)
+ }
+
+ rootMode := linux.FileMode(0777)
+ modeStr, ok := mopts["rootmode"]
+ if ok {
+ delete(mopts, "rootmode")
+ mode, err := strconv.ParseUint(modeStr, 8, 32)
+ if err != nil {
+ log.Warningf("%s.GetFilesystem: invalid mode: %q", fsType.Name(), modeStr)
+ return nil, nil, syserror.EINVAL
+ }
+ rootMode = linux.FileMode(mode)
+ }
+ fsopts.rootMode = rootMode
+
+	// Set the maxActiveRequests option.
+ fsopts.maxActiveRequests = maxActiveRequestsDefault
+
+ if maxReadStr, ok := mopts["max_read"]; ok {
+ delete(mopts, "max_read")
+ maxRead, err := strconv.ParseUint(maxReadStr, 10, 32)
+ if err != nil {
+ log.Warningf("%s.GetFilesystem: invalid max_read: max_read=%s", fsType.Name(), maxReadStr)
+ return nil, nil, syserror.EINVAL
+ }
+ if maxRead < fuseMinMaxRead {
+ maxRead = fuseMinMaxRead
+ }
+ fsopts.maxRead = uint32(maxRead)
+ } else {
+ fsopts.maxRead = math.MaxUint32
+ }
+
+ // Check for unparsed options.
+ if len(mopts) != 0 {
+ log.Warningf("%s.GetFilesystem: unsupported or unknown options: %v", fsType.Name(), mopts)
+ return nil, nil, syserror.EINVAL
+ }
+
+ // Create a new FUSE filesystem.
+ fs, err := newFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd)
+ if err != nil {
+ log.Warningf("%s.NewFUSEFilesystem: failed with error: %v", fsType.Name(), err)
+ return nil, nil, err
+ }
+
+ fs.VFSFilesystem().Init(vfsObj, &fsType, fs)
+
+ // Send a FUSE_INIT request to the FUSE daemon server before returning.
+ // This call is not blocking.
+ if err := fs.conn.InitSend(creds, uint32(kernelTask.ThreadID())); err != nil {
+ log.Warningf("%s.InitSend: failed with error: %v", fsType.Name(), err)
+ return nil, nil, err
+ }
+
+ // root is the fusefs root directory.
+ root := fs.newRootInode(ctx, creds, fsopts.rootMode)
+
+ return fs.VFSFilesystem(), root.VFSDentry(), nil
+}
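As a concrete illustration of the option parsing above, a FUSE daemon (or libfuse on its behalf) passes mount data of the following shape; the descriptor value 7 here is hypothetical:

	// Mount-data string in the form GetFilesystem expects: the fd of the
	// opened /dev/fuse, the root mode in octal (parsed with base 8, so no
	// "0o" prefix on the wire), the owner IDs, and an optional max_read.
	data := fmt.Sprintf("fd=%d,rootmode=%o,user_id=%d,group_id=%d,max_read=%d",
		7, 0o40000, 0, 0, 1<<20)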
+
+// newFUSEFilesystem creates a new FUSE filesystem.
+func newFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) {
+ conn, err := newFUSEConnection(ctx, device, opts)
+ if err != nil {
+ log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err)
+ return nil, syserror.EINVAL
+ }
+
+ fuseFD := device.Impl().(*DeviceFD)
+
+ fs := &filesystem{
+ devMinor: devMinor,
+ opts: opts,
+ conn: conn,
+ }
+
+ fs.VFSFilesystem().IncRef()
+ fuseFD.fs = fs
+
+ return fs, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release(ctx context.Context) {
+ fs.conn.fd.mu.Lock()
+
+ fs.umounted = true
+ fs.conn.Abort(ctx)
+ // Notify all the waiters on this fd.
+ fs.conn.fd.waitQueue.Notify(waiter.EventIn)
+
+ fs.conn.fd.mu.Unlock()
+
+ fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+ fs.Filesystem.Release(ctx)
+}
+
+// inode implements kernfs.Inode.
+//
+// +stateify savable
+type inode struct {
+ inodeRefs
+ kernfs.InodeAlwaysValid
+ kernfs.InodeAttrs
+ kernfs.InodeDirectoryNoNewChildren
+ kernfs.InodeNotSymlink
+ kernfs.OrderedChildren
+
+	// fs is the owning filesystem and is immutable.
+ fs *filesystem
+
+	// metadataMu protects the metadata of this inode.
+ metadataMu sync.Mutex
+
+ nodeID uint64
+
+ locks vfs.FileLocks
+
+ // size of the file.
+ size uint64
+
+ // attributeVersion is the version of inode's attributes.
+ attributeVersion uint64
+
+	// attributeTime is the remaining valid time of attributes.
+ attributeTime uint64
+
+ // version of the inode.
+ version uint64
+
+	// link is the result of following a symbolic link.
+ link string
+}
+
+func (fs *filesystem) newRootInode(ctx context.Context, creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry {
+ i := &inode{fs: fs, nodeID: 1}
+ i.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, 1, linux.ModeDirectory|0755)
+ i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+ i.EnableLeakCheck()
+
+ var d kernfs.Dentry
+ d.Init(&fs.Filesystem, i)
+ return &d
+}
+
+func (fs *filesystem) newInode(ctx context.Context, nodeID uint64, attr linux.FUSEAttr) kernfs.Inode {
+ i := &inode{fs: fs, nodeID: nodeID}
+	creds := auth.Credentials{EffectiveKGID: auth.KGID(attr.GID), EffectiveKUID: auth.KUID(attr.UID)}
+ i.InodeAttrs.Init(ctx, &creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.FileMode(attr.Mode))
+ atomic.StoreUint64(&i.size, attr.Size)
+ i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+ i.EnableLeakCheck()
+ return i
+}
+
+// Open implements kernfs.Inode.Open.
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ isDir := i.InodeAttrs.Mode().IsDir()
+	// Return an error if a directory open was requested but the inode is not a directory.
+ if !isDir && opts.Mode.IsDir() {
+ return nil, syserror.ENOTDIR
+ }
+ if opts.Flags&linux.O_LARGEFILE == 0 && atomic.LoadUint64(&i.size) > linux.MAX_NON_LFS {
+ return nil, syserror.EOVERFLOW
+ }
+
+ var fd *fileDescription
+ var fdImpl vfs.FileDescriptionImpl
+ if isDir {
+ directoryFD := &directoryFD{}
+ fd = &(directoryFD.fileDescription)
+ fdImpl = directoryFD
+ } else {
+ regularFD := &regularFileFD{}
+ fd = &(regularFD.fileDescription)
+ fdImpl = regularFD
+ }
+	// FOPEN_KEEP_CACHE is the default flag for noOpen.
+ fd.OpenFlag = linux.FOPEN_KEEP_CACHE
+
+	// Only send an open request when the FUSE server supports open or when opening a directory.
+ if !i.fs.conn.noOpen || isDir {
+ kernelTask := kernel.TaskFromContext(ctx)
+ if kernelTask == nil {
+ log.Warningf("fusefs.Inode.Open: couldn't get kernel task from context")
+ return nil, syserror.EINVAL
+ }
+
+ // Build the request.
+ var opcode linux.FUSEOpcode
+ if isDir {
+ opcode = linux.FUSE_OPENDIR
+ } else {
+ opcode = linux.FUSE_OPEN
+ }
+
+ in := linux.FUSEOpenIn{Flags: opts.Flags & ^uint32(linux.O_CREAT|linux.O_EXCL|linux.O_NOCTTY)}
+ if !i.fs.conn.atomicOTrunc {
+ in.Flags &= ^uint32(linux.O_TRUNC)
+ }
+
+ req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, &in)
+ if err != nil {
+ return nil, err
+ }
+
+ // Send the request and receive the reply.
+ res, err := i.fs.conn.Call(kernelTask, req)
+ if err != nil {
+ return nil, err
+ }
+ if err := res.Error(); err == syserror.ENOSYS && !isDir {
+ i.fs.conn.noOpen = true
+ } else if err != nil {
+ return nil, err
+ } else {
+ out := linux.FUSEOpenOut{}
+ if err := res.UnmarshalPayload(&out); err != nil {
+ return nil, err
+ }
+
+ // Process the reply.
+ fd.OpenFlag = out.OpenFlag
+ if isDir {
+ fd.OpenFlag &= ^uint32(linux.FOPEN_DIRECT_IO)
+ }
+
+ fd.Fh = out.Fh
+ }
+ }
+
+	// TODO(gvisor.dev/issue/3234): invalidate mmap after implementing it for FUSE inodes.
+ fd.DirectIO = fd.OpenFlag&linux.FOPEN_DIRECT_IO != 0
+ fdOptions := &vfs.FileDescriptionOptions{}
+ if fd.OpenFlag&linux.FOPEN_NONSEEKABLE != 0 {
+ fdOptions.DenyPRead = true
+ fdOptions.DenyPWrite = true
+ fd.Nonseekable = true
+ }
+
+	// If the FUSE server handles O_TRUNC atomically at open time (indicated
+	// by atomicOTrunc), we don't send a SETATTR before open; when O_TRUNC is
+	// set, update the inode's version number and discard existing data by
+	// setting the file size to 0.
+ if i.fs.conn.atomicOTrunc && opts.Flags&linux.O_TRUNC != 0 {
+ i.fs.conn.mu.Lock()
+ i.fs.conn.attributeVersion++
+ i.attributeVersion = i.fs.conn.attributeVersion
+ atomic.StoreUint64(&i.size, 0)
+ i.fs.conn.mu.Unlock()
+ i.attributeTime = 0
+ }
+
+ if err := fd.vfsfd.Init(fdImpl, opts.Flags, rp.Mount(), d.VFSDentry(), fdOptions); err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
+
+// Lookup implements kernfs.Inode.Lookup.
+func (i *inode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
+ in := linux.FUSELookupIn{Name: name}
+ return i.newEntry(ctx, name, 0, linux.FUSE_LOOKUP, &in)
+}
+
+// Keep implements kernfs.Inode.Keep.
+func (i *inode) Keep() bool {
+ // Return true so that kernfs keeps the new dentry pointing to this
+ // inode in the dentry tree. This is needed because inodes created via
+ // Lookup are not temporary. They might refer to existing files on server
+ // that can be Unlink'd/Rmdir'd.
+ return true
+}
+
+// IterDirents implements kernfs.Inode.IterDirents.
+func (*inode) IterDirents(ctx context.Context, mnt *vfs.Mount, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+ return offset, nil
+}
+
+// NewFile implements kernfs.Inode.NewFile.
+func (i *inode) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (kernfs.Inode, error) {
+ kernelTask := kernel.TaskFromContext(ctx)
+ if kernelTask == nil {
+ log.Warningf("fusefs.Inode.NewFile: couldn't get kernel task from context", i.nodeID)
+ return nil, syserror.EINVAL
+ }
+ in := linux.FUSECreateIn{
+ CreateMeta: linux.FUSECreateMeta{
+ Flags: opts.Flags,
+ Mode: uint32(opts.Mode) | linux.S_IFREG,
+ Umask: uint32(kernelTask.FSContext().Umask()),
+ },
+ Name: name,
+ }
+ return i.newEntry(ctx, name, linux.S_IFREG, linux.FUSE_CREATE, &in)
+}
+
+// NewNode implements kernfs.Inode.NewNode.
+func (i *inode) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (kernfs.Inode, error) {
+ in := linux.FUSEMknodIn{
+ MknodMeta: linux.FUSEMknodMeta{
+ Mode: uint32(opts.Mode),
+ Rdev: linux.MakeDeviceID(uint16(opts.DevMajor), opts.DevMinor),
+ Umask: uint32(kernel.TaskFromContext(ctx).FSContext().Umask()),
+ },
+ Name: name,
+ }
+ return i.newEntry(ctx, name, opts.Mode.FileType(), linux.FUSE_MKNOD, &in)
+}
+
+// NewSymlink implements kernfs.Inode.NewSymlink.
+func (i *inode) NewSymlink(ctx context.Context, name, target string) (kernfs.Inode, error) {
+ in := linux.FUSESymLinkIn{
+ Name: name,
+ Target: target,
+ }
+ return i.newEntry(ctx, name, linux.S_IFLNK, linux.FUSE_SYMLINK, &in)
+}
+
+// Unlink implements kernfs.Inode.Unlink.
+func (i *inode) Unlink(ctx context.Context, name string, child kernfs.Inode) error {
+ kernelTask := kernel.TaskFromContext(ctx)
+ if kernelTask == nil {
+ log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID)
+ return syserror.EINVAL
+ }
+ in := linux.FUSEUnlinkIn{Name: name}
+ req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_UNLINK, &in)
+ if err != nil {
+ return err
+ }
+ res, err := i.fs.conn.Call(kernelTask, req)
+ if err != nil {
+ return err
+ }
+	// Only return the error; discard res.
+ return res.Error()
+}
+
+// NewDir implements kernfs.Inode.NewDir.
+func (i *inode) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) {
+ in := linux.FUSEMkdirIn{
+ MkdirMeta: linux.FUSEMkdirMeta{
+ Mode: uint32(opts.Mode),
+ Umask: uint32(kernel.TaskFromContext(ctx).FSContext().Umask()),
+ },
+ Name: name,
+ }
+ return i.newEntry(ctx, name, linux.S_IFDIR, linux.FUSE_MKDIR, &in)
+}
+
+// RmDir implements kernfs.Inode.RmDir.
+func (i *inode) RmDir(ctx context.Context, name string, child kernfs.Inode) error {
+ fusefs := i.fs
+ task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
+
+ in := linux.FUSERmDirIn{Name: name}
+ req, err := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_RMDIR, &in)
+ if err != nil {
+ return err
+ }
+
+ res, err := i.fs.conn.Call(task, req)
+ if err != nil {
+ return err
+ }
+ return res.Error()
+}
+
+// newEntry calls the FUSE server to create an entry and allocates the
+// corresponding inode according to the response.
+// Shared by FUSE_MKNOD, FUSE_MKDIR, FUSE_SYMLINK, FUSE_LINK and FUSE_LOOKUP.
+func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMode, opcode linux.FUSEOpcode, payload marshal.Marshallable) (kernfs.Inode, error) {
+ kernelTask := kernel.TaskFromContext(ctx)
+ if kernelTask == nil {
+ log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID)
+ return nil, syserror.EINVAL
+ }
+ req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, payload)
+ if err != nil {
+ return nil, err
+ }
+ res, err := i.fs.conn.Call(kernelTask, req)
+ if err != nil {
+ return nil, err
+ }
+ if err := res.Error(); err != nil {
+ return nil, err
+ }
+ out := linux.FUSEEntryOut{}
+ if err := res.UnmarshalPayload(&out); err != nil {
+ return nil, err
+ }
+ if opcode != linux.FUSE_LOOKUP && ((out.Attr.Mode&linux.S_IFMT)^uint32(fileType) != 0 || out.NodeID == 0 || out.NodeID == linux.FUSE_ROOT_ID) {
+ return nil, syserror.EIO
+ }
+ child := i.fs.newInode(ctx, out.NodeID, out.Attr)
+ return child, nil
+}
+
+// Getlink implements kernfs.Inode.Getlink.
+func (i *inode) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
+ path, err := i.Readlink(ctx, mnt)
+ return vfs.VirtualDentry{}, path, err
+}
+
+// Readlink implements kernfs.Inode.Readlink.
+func (i *inode) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
+ if i.Mode().FileType()&linux.S_IFLNK == 0 {
+ return "", syserror.EINVAL
+ }
+ if len(i.link) == 0 {
+ kernelTask := kernel.TaskFromContext(ctx)
+ if kernelTask == nil {
+ log.Warningf("fusefs.Inode.Readlink: couldn't get kernel task from context")
+ return "", syserror.EINVAL
+ }
+ req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_READLINK, &linux.FUSEEmptyIn{})
+ if err != nil {
+ return "", err
+ }
+ res, err := i.fs.conn.Call(kernelTask, req)
+ if err != nil {
+ return "", err
+ }
+ i.link = string(res.data[res.hdr.SizeBytes():])
+ if !mnt.Options().ReadOnly {
+ i.attributeTime = 0
+ }
+ }
+ return i.link, nil
+}
+
+// getFUSEAttr returns a linux.FUSEAttr of this inode stored in local cache.
+// TODO(gvisor.dev/issue/3679): Add support for other fields.
+func (i *inode) getFUSEAttr() linux.FUSEAttr {
+ return linux.FUSEAttr{
+ Ino: i.Ino(),
+ Size: atomic.LoadUint64(&i.size),
+ Mode: uint32(i.Mode()),
+ }
+}
+
+// statFromFUSEAttr converts a linux.FUSEAttr to a linux.Statx. The
+// opts.Sync attribute is ignored since the synchronization is handled by the
+// FUSE server.
+func statFromFUSEAttr(attr linux.FUSEAttr, mask, devMinor uint32) linux.Statx {
+ var stat linux.Statx
+ stat.Blksize = attr.BlkSize
+ stat.DevMajor, stat.DevMinor = linux.UNNAMED_MAJOR, devMinor
+
+ rdevMajor, rdevMinor := linux.DecodeDeviceID(attr.Rdev)
+ stat.RdevMajor, stat.RdevMinor = uint32(rdevMajor), rdevMinor
+
+ if mask&linux.STATX_MODE != 0 {
+ stat.Mode = uint16(attr.Mode)
+ }
+ if mask&linux.STATX_NLINK != 0 {
+ stat.Nlink = attr.Nlink
+ }
+ if mask&linux.STATX_UID != 0 {
+ stat.UID = attr.UID
+ }
+ if mask&linux.STATX_GID != 0 {
+ stat.GID = attr.GID
+ }
+ if mask&linux.STATX_ATIME != 0 {
+ stat.Atime = linux.StatxTimestamp{
+ Sec: int64(attr.Atime),
+ Nsec: attr.AtimeNsec,
+ }
+ }
+ if mask&linux.STATX_MTIME != 0 {
+ stat.Mtime = linux.StatxTimestamp{
+ Sec: int64(attr.Mtime),
+ Nsec: attr.MtimeNsec,
+ }
+ }
+ if mask&linux.STATX_CTIME != 0 {
+ stat.Ctime = linux.StatxTimestamp{
+ Sec: int64(attr.Ctime),
+ Nsec: attr.CtimeNsec,
+ }
+ }
+ if mask&linux.STATX_INO != 0 {
+ stat.Ino = attr.Ino
+ }
+ if mask&linux.STATX_SIZE != 0 {
+ stat.Size = attr.Size
+ }
+ if mask&linux.STATX_BLOCKS != 0 {
+ stat.Blocks = attr.Blocks
+ }
+ return stat
+}
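A short usage sketch of the mask handling above, assuming devMinor holds the filesystem's anonymous device minor:

	// Convert only the mode and size from a FUSE attribute reply; fields
	// not selected by the mask stay zero in the resulting Statx.
	attr := linux.FUSEAttr{Mode: linux.S_IFREG | 0644, Size: 4096}
	stat := statFromFUSEAttr(attr, linux.STATX_MODE|linux.STATX_SIZE, devMinor)
	// stat.Mode and stat.Size are populated; stat.Nlink, stat.UID, etc.
	// remain zero.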
+
+// getAttr gets the attributes of this inode by issuing a FUSE_GETATTR request
+// or by reading the local cache. It updates the corresponding attributes if
+// necessary.
+func (i *inode) getAttr(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions, flags uint32, fh uint64) (linux.FUSEAttr, error) {
+ attributeVersion := atomic.LoadUint64(&i.fs.conn.attributeVersion)
+
+ // TODO(gvisor.dev/issue/3679): send the request only if
+ // - invalid local cache for fields specified in the opts.Mask
+ // - forced update
+ // - i.attributeTime expired
+ // If local cache is still valid, return local cache.
+ // Currently we always send a request,
+ // and we always set the metadata with the new result,
+ // unless attributeVersion has changed.
+
+ task := kernel.TaskFromContext(ctx)
+ if task == nil {
+ log.Warningf("couldn't get kernel task from context")
+ return linux.FUSEAttr{}, syserror.EINVAL
+ }
+
+ creds := auth.CredentialsFromContext(ctx)
+
+ in := linux.FUSEGetAttrIn{
+ GetAttrFlags: flags,
+ Fh: fh,
+ }
+ req, err := i.fs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_GETATTR, &in)
+ if err != nil {
+ return linux.FUSEAttr{}, err
+ }
+
+ res, err := i.fs.conn.Call(task, req)
+ if err != nil {
+ return linux.FUSEAttr{}, err
+ }
+ if err := res.Error(); err != nil {
+ return linux.FUSEAttr{}, err
+ }
+
+ var out linux.FUSEGetAttrOut
+ if err := res.UnmarshalPayload(&out); err != nil {
+ return linux.FUSEAttr{}, err
+ }
+
+ // Local version is newer, return the local one.
+ // Skip the update.
+ if attributeVersion != 0 && atomic.LoadUint64(&i.attributeVersion) > attributeVersion {
+ return i.getFUSEAttr(), nil
+ }
+
+ // Set the metadata of kernfs.InodeAttrs.
+ if err := i.InodeAttrs.SetStat(ctx, fs, creds, vfs.SetStatOptions{
+ Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, i.fs.devMinor),
+ }); err != nil {
+ return linux.FUSEAttr{}, err
+ }
+
+ // Set the size if no error (after SetStat() check).
+ atomic.StoreUint64(&i.size, out.Attr.Size)
+
+ return out.Attr, nil
+}
+
+// reviseAttr attempts to update the attributes for internal purposes
+// by calling getAttr with a pre-specified mask.
+// Used by read, write, lseek.
+func (i *inode) reviseAttr(ctx context.Context, flags uint32, fh uint64) error {
+ // Never need atime for internal purposes.
+ _, err := i.getAttr(ctx, i.fs.VFSFilesystem(), vfs.StatOptions{
+ Mask: linux.STATX_BASIC_STATS &^ linux.STATX_ATIME,
+ }, flags, fh)
+ return err
+}
+
+// Stat implements kernfs.Inode.Stat.
+func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+ attr, err := i.getAttr(ctx, fs, opts, 0, 0)
+ if err != nil {
+ return linux.Statx{}, err
+ }
+
+ return statFromFUSEAttr(attr, opts.Mask, i.fs.devMinor), nil
+}
+
+// DecRef implements kernfs.Inode.DecRef.
+func (i *inode) DecRef(ctx context.Context) {
+ i.inodeRefs.DecRef(func() { i.Destroy(ctx) })
+}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
+ // TODO(gvisor.dev/issues/3413): Complete the implementation of statfs.
+ return vfs.GenericStatFS(linux.FUSE_SUPER_MAGIC), nil
+}
+
+// fattrMaskFromStats converts vfs.SetStatOptions.Stat.Mask to a linux stats
+// mask aligned with the attribute mask defined in include/linux/fs.h.
+func fattrMaskFromStats(mask uint32) uint32 {
+ var fuseAttrMask uint32
+ maskMap := map[uint32]uint32{
+ linux.STATX_MODE: linux.FATTR_MODE,
+ linux.STATX_UID: linux.FATTR_UID,
+ linux.STATX_GID: linux.FATTR_GID,
+ linux.STATX_SIZE: linux.FATTR_SIZE,
+ linux.STATX_ATIME: linux.FATTR_ATIME,
+ linux.STATX_MTIME: linux.FATTR_MTIME,
+ linux.STATX_CTIME: linux.FATTR_CTIME,
+ }
+ for statxMask, fattrMask := range maskMap {
+ if mask&statxMask != 0 {
+ fuseAttrMask |= fattrMask
+ }
+ }
+ return fuseAttrMask
+}
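For example, the statx mask produced by a combined chmod and truncate maps as in this sketch:

	// STATX_MODE|STATX_SIZE becomes FATTR_MODE|FATTR_SIZE, which is placed
	// in FUSE_SETATTR's Valid field to tell the server which attributes to
	// apply.
	mask := fattrMaskFromStats(linux.STATX_MODE | linux.STATX_SIZE)
	_ = mask // mask == linux.FATTR_MODE|linux.FATTR_SIZE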
+
+// SetStat implements kernfs.Inode.SetStat.
+func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+ return i.setAttr(ctx, fs, creds, opts, false, 0)
+}
+
+func (i *inode) setAttr(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions, useFh bool, fh uint64) error {
+ conn := i.fs.conn
+ task := kernel.TaskFromContext(ctx)
+ if task == nil {
+ log.Warningf("couldn't get kernel task from context")
+ return syserror.EINVAL
+ }
+
+ // We should retain the original file type when assigning new mode.
+ fileType := uint16(i.Mode()) & linux.S_IFMT
+ fattrMask := fattrMaskFromStats(opts.Stat.Mask)
+ if useFh {
+ fattrMask |= linux.FATTR_FH
+ }
+ in := linux.FUSESetAttrIn{
+ Valid: fattrMask,
+ Fh: fh,
+ Size: opts.Stat.Size,
+ Atime: uint64(opts.Stat.Atime.Sec),
+ Mtime: uint64(opts.Stat.Mtime.Sec),
+ Ctime: uint64(opts.Stat.Ctime.Sec),
+ AtimeNsec: opts.Stat.Atime.Nsec,
+ MtimeNsec: opts.Stat.Mtime.Nsec,
+ CtimeNsec: opts.Stat.Ctime.Nsec,
+ Mode: uint32(fileType | opts.Stat.Mode),
+ UID: opts.Stat.UID,
+ GID: opts.Stat.GID,
+ }
+ req, err := conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_SETATTR, &in)
+ if err != nil {
+ return err
+ }
+
+ res, err := conn.Call(task, req)
+ if err != nil {
+ return err
+ }
+ if err := res.Error(); err != nil {
+ return err
+ }
+ out := linux.FUSEGetAttrOut{}
+ if err := res.UnmarshalPayload(&out); err != nil {
+ return err
+ }
+
+ // Set the metadata of kernfs.InodeAttrs.
+ if err := i.InodeAttrs.SetStat(ctx, fs, creds, vfs.SetStatOptions{
+ Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, i.fs.devMinor),
+ }); err != nil {
+ return err
+ }
+
+ return nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/read_write.go b/pkg/sentry/fsimpl/fuse/read_write.go
new file mode 100644
index 000000000..2d396e84c
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/read_write.go
@@ -0,0 +1,244 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "io"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// ReadInPages sends FUSE_READ requests for the given size after rounding it
+// up to a multiple of the page size, blocks on the replies, processes them,
+// and returns the payload (or joined payloads) as a slice of byte slices.
+// This is used for general-purpose reading.
+// We do not support direct I/O (which reads the exact number of bytes)
+// at this moment.
+func (fs *filesystem) ReadInPages(ctx context.Context, fd *regularFileFD, off uint64, size uint32) ([][]byte, uint32, error) {
+ attributeVersion := atomic.LoadUint64(&fs.conn.attributeVersion)
+
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ log.Warningf("fusefs.Read: couldn't get kernel task from context")
+ return nil, 0, syserror.EINVAL
+ }
+
+ // Round up to a multiple of page size.
+ readSize, _ := usermem.PageRoundUp(uint64(size))
+
+	// One request cannot exceed either maxRead or maxPages.
+ maxPages := fs.conn.maxRead >> usermem.PageShift
+ if maxPages > uint32(fs.conn.maxPages) {
+ maxPages = uint32(fs.conn.maxPages)
+ }
+
+ var outs [][]byte
+ var sizeRead uint32
+
+ // readSize is a multiple of usermem.PageSize.
+ // Always request bytes as a multiple of pages.
+ pagesRead, pagesToRead := uint32(0), uint32(readSize>>usermem.PageShift)
+
+	// Reuse the same struct for marshalling to avoid unnecessary memory allocation.
+ in := linux.FUSEReadIn{
+ Fh: fd.Fh,
+ LockOwner: 0, // TODO(gvisor.dev/issue/3245): file lock
+ ReadFlags: 0, // TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER
+ Flags: fd.statusFlags(),
+ }
+
+	// This loop is intended for fragmented reads where the number of bytes to
+	// read is larger than either maxPages or maxRead.
+	// For the majority of reads with normal size, this loop should only
+	// execute once.
+ for pagesRead < pagesToRead {
+ pagesCanRead := pagesToRead - pagesRead
+ if pagesCanRead > maxPages {
+ pagesCanRead = maxPages
+ }
+
+ in.Offset = off + (uint64(pagesRead) << usermem.PageShift)
+ in.Size = pagesCanRead << usermem.PageShift
+
+ req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), fd.inode().nodeID, linux.FUSE_READ, &in)
+ if err != nil {
+ return nil, 0, err
+ }
+
+ // TODO(gvisor.dev/issue/3247): support async read.
+
+ res, err := fs.conn.Call(t, req)
+ if err != nil {
+ return nil, 0, err
+ }
+ if err := res.Error(); err != nil {
+ return nil, 0, err
+ }
+
+		// Not enough bytes in the response:
+		// either we reached EOF,
+		// or the FUSE server sent back a response
+		// that cannot even fit the header.
+		if len(res.data) <= res.hdr.SizeBytes() {
+			// We treat both cases as EOF here for now
+			// since there is no reliable way to detect
+			// the over-short header case.
+			break
+ }
+
+ // Directly using the slice to avoid extra copy.
+ out := res.data[res.hdr.SizeBytes():]
+
+ outs = append(outs, out)
+ sizeRead += uint32(len(out))
+
+ pagesRead += pagesCanRead
+ }
+
+ defer fs.ReadCallback(ctx, fd, off, size, sizeRead, attributeVersion)
+
+ // No bytes returned: offset >= EOF.
+ if len(outs) == 0 {
+ return nil, 0, io.EOF
+ }
+
+ return outs, sizeRead, nil
+}
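To make the fragmentation math above concrete, a sketch assuming an illustrative 4 KiB page size and a maxPages of 32:

	// A 1 MiB read rounds up to 256 pages and is issued as
	// ceil(256/32) == 8 FUSE_READ requests of 32 pages each.
	readSize, _ := usermem.PageRoundUp(uint64(1 << 20))
	pagesToRead := uint32(readSize >> usermem.PageShift) // 256 pages.
	maxPages := uint32(32)                               // Assumed limit.
	requests := (pagesToRead + maxPages - 1) / maxPages  // 8 requests.
	_ = requests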
+
+// ReadCallback updates the inode's metadata after receiving a read response.
+// Due to readahead, sizeRead can be larger than size.
+func (fs *filesystem) ReadCallback(ctx context.Context, fd *regularFileFD, off uint64, size uint32, sizeRead uint32, attributeVersion uint64) {
+ // TODO(gvisor.dev/issue/3247): support async read.
+ // If this is called by an async read, correctly process it.
+ // May need to update the signature.
+
+ i := fd.inode()
+ i.InodeAttrs.TouchAtime(ctx, fd.vfsfd.Mount())
+
+ // Reached EOF.
+ if sizeRead < size {
+ // TODO(gvisor.dev/issue/3630): If we have writeback cache, then we need to fill this hole.
+ // Might need to update the buf to be returned from the Read().
+
+ // Update existing size.
+ newSize := off + uint64(sizeRead)
+ fs.conn.mu.Lock()
+ if attributeVersion == i.attributeVersion && newSize < atomic.LoadUint64(&i.size) {
+ fs.conn.attributeVersion++
+ i.attributeVersion = i.fs.conn.attributeVersion
+ atomic.StoreUint64(&i.size, newSize)
+ }
+ fs.conn.mu.Unlock()
+ }
+}
+
+// Write sends FUSE_WRITE requests and returns the number of bytes
+// written according to the response.
+//
+// Preconditions: len(data) == size.
+func (fs *filesystem) Write(ctx context.Context, fd *regularFileFD, off uint64, size uint32, data []byte) (uint32, error) {
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ log.Warningf("fusefs.Read: couldn't get kernel task from context")
+ return 0, syserror.EINVAL
+ }
+
+	// One request cannot exceed either maxWrite or maxPages.
+ maxWrite := uint32(fs.conn.maxPages) << usermem.PageShift
+ if maxWrite > fs.conn.maxWrite {
+ maxWrite = fs.conn.maxWrite
+ }
+
+	// Reuse the same struct for marshalling to avoid unnecessary memory allocation.
+ in := linux.FUSEWriteIn{
+ Fh: fd.Fh,
+ // TODO(gvisor.dev/issue/3245): file lock
+ LockOwner: 0,
+ // TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER
+ // TODO(gvisor.dev/issue/3237): |= linux.FUSE_WRITE_CACHE (not added yet)
+ WriteFlags: 0,
+ Flags: fd.statusFlags(),
+ }
+
+ inode := fd.inode()
+ var written uint32
+
+	// This loop is intended for fragmented writes where the number of bytes to
+	// write is larger than either maxWrite or maxPages, or when bigWrites is
+	// false. Unless a small value for max_write is explicitly used, this loop
+	// is expected to execute only once for the majority of writes.
+ for written < size {
+ toWrite := size - written
+
+		// Limit the write size to one page.
+		// Note that the bigWrites flag is obsolete;
+		// the latest libfuse always sets it on.
+ if !fs.conn.bigWrites && toWrite > usermem.PageSize {
+ toWrite = usermem.PageSize
+ }
+
+ // Limit the write size to maxWrite.
+ if toWrite > maxWrite {
+ toWrite = maxWrite
+ }
+
+ in.Offset = off + uint64(written)
+ in.Size = toWrite
+
+ req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), inode.nodeID, linux.FUSE_WRITE, &in)
+ if err != nil {
+ return 0, err
+ }
+
+ req.payload = data[written : written+toWrite]
+
+ // TODO(gvisor.dev/issue/3247): support async write.
+
+ res, err := fs.conn.Call(t, req)
+ if err != nil {
+ return 0, err
+ }
+ if err := res.Error(); err != nil {
+ return 0, err
+ }
+
+ out := linux.FUSEWriteOut{}
+ if err := res.UnmarshalPayload(&out); err != nil {
+ return 0, err
+ }
+
+ // Write more than requested? EIO.
+ if out.Size > toWrite {
+ return 0, syserror.EIO
+ }
+
+ written += out.Size
+
+ // Break if short write. Not necessarily an error.
+ if out.Size != toWrite {
+ break
+ }
+ }
+ inode.InodeAttrs.TouchCMtime(ctx)
+
+ return written, nil
+}
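The write path fragments analogously; a sketch with an assumed maxWrite of 128 KiB:

	// A 300 KiB write is sent as chunks of 128, 128, and 44 KiB; a short
	// write from the server ends the loop early. maxWrite is an assumed
	// value for illustration.
	const maxWrite = 128 << 10
	size := uint32(300 << 10)
	for written := uint32(0); written < size; {
		toWrite := size - written
		if toWrite > maxWrite {
			toWrite = maxWrite
		}
		// ... issue FUSE_WRITE for toWrite bytes at offset off+written ...
		written += toWrite
	}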
diff --git a/pkg/sentry/fsimpl/fuse/register.go b/pkg/sentry/fsimpl/fuse/register.go
new file mode 100644
index 000000000..b5b581152
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/register.go
@@ -0,0 +1,42 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// Register registers the FUSE device with vfsObj.
+func Register(vfsObj *vfs.VirtualFilesystem) error {
+ if err := vfsObj.RegisterDevice(vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, fuseDevice{}, &vfs.RegisterDeviceOptions{
+ GroupName: "misc",
+ }); err != nil {
+ return err
+ }
+
+ return nil
+}
+
+// CreateDevtmpfsFile creates a device special file in devtmpfs.
+func CreateDevtmpfsFile(ctx context.Context, dev *devtmpfs.Accessor) error {
+ if err := dev.CreateDeviceFile(ctx, "fuse", vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, 0666 /* mode */); err != nil {
+ return err
+ }
+
+ return nil
+}
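Taken together, the two helpers above are intended to run during VFS setup; a minimal sketch, assuming vfsObj (*vfs.VirtualFilesystem) and a (*devtmpfs.Accessor) were created earlier during boot:

	// Register the /dev/fuse character device and create its device node.
	if err := fuse.Register(vfsObj); err != nil {
		return fmt.Errorf("registering fuse device: %v", err)
	}
	if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil {
		return fmt.Errorf("creating /dev/fuse: %v", err)
	}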
diff --git a/pkg/sentry/fsimpl/fuse/regular_file.go b/pkg/sentry/fsimpl/fuse/regular_file.go
new file mode 100644
index 000000000..5bdd096c3
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/regular_file.go
@@ -0,0 +1,230 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "io"
+ "math"
+ "sync"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+type regularFileFD struct {
+ fileDescription
+
+ // off is the file offset.
+ off int64
+ // offMu protects off.
+ offMu sync.Mutex
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+
+ // Check that flags are supported.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+ if opts.Flags&^linux.RWF_HIPRI != 0 {
+ return 0, syserror.EOPNOTSUPP
+ }
+
+ size := dst.NumBytes()
+ if size == 0 {
+ // Early return if count is 0.
+ return 0, nil
+ } else if size > math.MaxUint32 {
+ // FUSE only supports uint32 for size.
+ // Overflow.
+ return 0, syserror.EINVAL
+ }
+
+ // TODO(gvisor.dev/issue/3678): Add direct IO support.
+
+ inode := fd.inode()
+
+	// If reading beyond EOF, update the file size in case it is outdated.
+ if uint64(offset+size) > atomic.LoadUint64(&inode.size) {
+ if err := inode.reviseAttr(ctx, linux.FUSE_GETATTR_FH, fd.Fh); err != nil {
+ return 0, err
+ }
+ // If the offset after update is still too large, return error.
+ if uint64(offset) >= atomic.LoadUint64(&inode.size) {
+ return 0, io.EOF
+ }
+ }
+
+ // Truncate the read with updated file size.
+ fileSize := atomic.LoadUint64(&inode.size)
+ if uint64(offset+size) > fileSize {
+ size = int64(fileSize) - offset
+ }
+
+ buffers, n, err := inode.fs.ReadInPages(ctx, fd, uint64(offset), uint32(size))
+ if err != nil {
+ return 0, err
+ }
+
+ // TODO(gvisor.dev/issue/3237): support indirect IO (e.g. caching),
+ // store the bytes that were read ahead.
+
+	// Update the number of bytes to copy for a short read.
+ if n < uint32(size) {
+ size = int64(n)
+ }
+
+ // Copy the bytes read to the dst.
+ // This loop is intended for fragmented reads.
+	// For the majority of reads, this loop only executes once.
+ var copied int64
+ for _, buffer := range buffers {
+ toCopy := int64(len(buffer))
+ if copied+toCopy > size {
+ toCopy = size - copied
+ }
+ cp, err := dst.DropFirst64(copied).CopyOut(ctx, buffer[:toCopy])
+ if err != nil {
+ return 0, err
+ }
+ if int64(cp) != toCopy {
+ return 0, syserror.EIO
+ }
+ copied += toCopy
+ }
+
+ return copied, nil
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ fd.offMu.Lock()
+ n, err := fd.PRead(ctx, dst, fd.off, opts)
+ fd.off += n
+ fd.offMu.Unlock()
+ return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ n, _, err := fd.pwrite(ctx, src, offset, opts)
+ return n, err
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ fd.offMu.Lock()
+ n, off, err := fd.pwrite(ctx, src, fd.off, opts)
+ fd.off = off
+ fd.offMu.Unlock()
+ return n, err
+}
+
+// pwrite returns the number of bytes written, final offset and error. The
+// final offset should be ignored by PWrite.
+func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
+ if offset < 0 {
+ return 0, offset, syserror.EINVAL
+ }
+
+ // Check that flags are supported.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+ if opts.Flags&^linux.RWF_HIPRI != 0 {
+ return 0, offset, syserror.EOPNOTSUPP
+ }
+
+ inode := fd.inode()
+ inode.metadataMu.Lock()
+ defer inode.metadataMu.Unlock()
+
+ // If the file is opened with O_APPEND, update offset to file size.
+	// Note: since our Open() implements the kernfs interface, and kernfs
+	// currently does not support O_APPEND, this will never be true until we
+	// move away from kernfs.
+ if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+		// Locking inode.metadataMu is sufficient for reading size.
+ offset = int64(inode.size)
+ }
+
+ srclen := src.NumBytes()
+
+ if srclen > math.MaxUint32 {
+ // FUSE only supports uint32 for size.
+ // Overflow.
+ return 0, offset, syserror.EINVAL
+ }
+ if end := offset + srclen; end < offset {
+ // Overflow.
+ return 0, offset, syserror.EINVAL
+ }
+
+ srclen, err = vfs.CheckLimit(ctx, offset, srclen)
+ if err != nil {
+ return 0, offset, err
+ }
+
+ if srclen == 0 {
+ // Return before causing any side effects.
+ return 0, offset, nil
+ }
+
+ src = src.TakeFirst64(srclen)
+
+ // TODO(gvisor.dev/issue/3237): Add cache support:
+ // buffer cache. Ideally we write from src to our buffer cache first.
+ // The slice passed to fs.Write() should be a slice from buffer cache.
+ data := make([]byte, srclen)
+	// Reason for making a copy here: connection.Call() blocks on the kernel
+	// task, which in turn acquires the mm.activeMu lock. Functions like
+	// CopyInTo() will attempt to acquire the mm.activeMu lock as well ->
+	// deadlock. We must finish reading from the userspace memory before
+	// t.Block() deactivates it.
+ cp, err := src.CopyIn(ctx, data)
+ if err != nil {
+ return 0, offset, err
+ }
+ if int64(cp) != srclen {
+ return 0, offset, syserror.EIO
+ }
+
+ n, err := fd.inode().fs.Write(ctx, fd, uint64(offset), uint32(srclen), data)
+ if err != nil {
+ return 0, offset, err
+ }
+
+ if n == 0 {
+ // We have checked srclen != 0 previously.
+ // If err == nil, then it's a short write and we return EIO.
+ return 0, offset, syserror.EIO
+ }
+
+ written = int64(n)
+ finalOff = offset + written
+
+ if finalOff > int64(inode.size) {
+ atomic.StoreUint64(&inode.size, uint64(finalOff))
+ atomic.AddUint64(&inode.fs.conn.attributeVersion, 1)
+ }
+
+ return
+}
diff --git a/pkg/sentry/fsimpl/fuse/request_response.go b/pkg/sentry/fsimpl/fuse/request_response.go
new file mode 100644
index 000000000..7fa00569b
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/request_response.go
@@ -0,0 +1,229 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "fmt"
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// fuseInitRes is a variable-length wrapper of linux.FUSEInitOut. The FUSE
+// server may implement an older version of the FUSE protocol, whose
+// linux.FUSEInitOut contains fewer fields.
+//
+// Dynamically-sized objects cannot be marshalled.
+type fuseInitRes struct {
+ marshal.StubMarshallable
+
+ // initOut contains the response from the FUSE server.
+ initOut linux.FUSEInitOut
+
+ // initLen is the total length of the response, in bytes.
+ initLen uint32
+}
+
+// UnmarshalBytes deserializes src into the initOut field of the fuseInitRes.
+func (r *fuseInitRes) UnmarshalBytes(src []byte) {
+ out := &r.initOut
+
+ // Introduced before FUSE kernel version 7.13.
+ out.Major = usermem.ByteOrder.Uint32(src[:4])
+ src = src[4:]
+ out.Minor = usermem.ByteOrder.Uint32(src[:4])
+ src = src[4:]
+ out.MaxReadahead = usermem.ByteOrder.Uint32(src[:4])
+ src = src[4:]
+ out.Flags = usermem.ByteOrder.Uint32(src[:4])
+ src = src[4:]
+ out.MaxBackground = usermem.ByteOrder.Uint16(src[:2])
+ src = src[2:]
+ out.CongestionThreshold = usermem.ByteOrder.Uint16(src[:2])
+ src = src[2:]
+ out.MaxWrite = usermem.ByteOrder.Uint32(src[:4])
+ src = src[4:]
+
+ // Introduced in FUSE kernel version 7.23.
+ if len(src) >= 4 {
+ out.TimeGran = usermem.ByteOrder.Uint32(src[:4])
+ src = src[4:]
+ }
+ // Introduced in FUSE kernel version 7.28.
+ if len(src) >= 2 {
+ out.MaxPages = usermem.ByteOrder.Uint16(src[:2])
+ src = src[2:]
+ }
+}
+
+// SizeBytes returns the size in bytes of the payload of the FUSE_INIT response.
+func (r *fuseInitRes) SizeBytes() int {
+ return int(r.initLen)
+}
+
+// reqIDStep is the increment applied to the unique ID of each FUSE request.
+// Ordinary requests have even IDs, while interrupt IDs are odd.
+var reqIDStep uint64 = 2
+
+// Request represents a FUSE operation request that hasn't been sent to the
+// server yet.
+//
+// +stateify savable
+type Request struct {
+ requestEntry
+
+ id linux.FUSEOpID
+ hdr *linux.FUSEHeaderIn
+ data []byte
+
+ // payload is the extra bytes to write after the data slice,
+ // used by FUSE_WRITE.
+ payload []byte
+
+ // async is true if the request is sent asynchronously, i.e. no
+ // task blocks waiting for its response.
+ async bool
+ // noReply is true if the caller does not care about the response.
+ // It is set manually by the caller.
+ noReply bool
+}
+
+// NewRequest creates a new request that can be sent to the FUSE server.
+func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) {
+ conn.fd.mu.Lock()
+ defer conn.fd.mu.Unlock()
+ conn.fd.nextOpID += linux.FUSEOpID(reqIDStep)
+
+ hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes()
+ hdr := linux.FUSEHeaderIn{
+ Len: uint32(hdrLen + payload.SizeBytes()),
+ Opcode: opcode,
+ Unique: conn.fd.nextOpID,
+ NodeID: ino,
+ UID: uint32(creds.EffectiveKUID),
+ GID: uint32(creds.EffectiveKGID),
+ PID: pid,
+ }
+
+ buf := make([]byte, hdr.Len)
+
+ // TODO(gvisor.dev/issue/3698): Use the unsafe version once go_marshal is safe to use again.
+ hdr.MarshalBytes(buf[:hdrLen])
+ payload.MarshalBytes(buf[hdrLen:])
+
+ return &Request{
+ id: hdr.Unique,
+ hdr: &hdr,
+ data: buf,
+ }, nil
+}
+
+// futureResponse represents an in-flight request, that may or may not have
+// completed yet. Convert it to a resolved Response by calling Resolve, but note
+// that this may block.
+//
+// +stateify savable
+type futureResponse struct {
+ opcode linux.FUSEOpcode
+ ch chan struct{}
+ hdr *linux.FUSEHeaderOut
+ data []byte
+
+ // async is true if the corresponding request was sent asynchronously;
+ // no task blocks on this response.
+ async bool
+}
+
+// newFutureResponse creates a future response to a FUSE request.
+func newFutureResponse(req *Request) *futureResponse {
+ return &futureResponse{
+ opcode: req.hdr.Opcode,
+ ch: make(chan struct{}),
+ async: req.async,
+ }
+}
+
+// resolve blocks the task until the server responds to its corresponding request,
+// then returns a resolved response.
+func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) {
+ // Return directly for async requests.
+ if f.async {
+ return nil, nil
+ }
+
+ if err := t.Block(f.ch); err != nil {
+ return nil, err
+ }
+
+ return f.getResponse(), nil
+}
+
+// getResponse creates a Response from the data the futureResponse has.
+func (f *futureResponse) getResponse() *Response {
+ return &Response{
+ opcode: f.opcode,
+ hdr: *f.hdr,
+ data: f.data,
+ }
+}
+
+// Response represents an actual response from the server, including the
+// response payload.
+//
+// +stateify savable
+type Response struct {
+ opcode linux.FUSEOpcode
+ hdr linux.FUSEHeaderOut
+ data []byte
+}
+
+// Error returns the error of the FUSE call.
+func (r *Response) Error() error {
+ errno := r.hdr.Error
+ if errno >= 0 {
+ return nil
+ }
+
+ return syscall.Errno(-errno)
+}
+
+// DataLen returns the size of the response without the header.
+func (r *Response) DataLen() uint32 {
+ return r.hdr.Len - uint32(r.hdr.SizeBytes())
+}
+
+// UnmarshalPayload unmarshals the response data into m.
+func (r *Response) UnmarshalPayload(m marshal.Marshallable) error {
+ hdrLen := r.hdr.SizeBytes()
+ haveDataLen := r.hdr.Len - uint32(hdrLen)
+ wantDataLen := uint32(m.SizeBytes())
+
+ if haveDataLen < wantDataLen {
+ return fmt.Errorf("payload too small. Minimum data lenth required: %d, but got data length %d", wantDataLen, haveDataLen)
+ }
+
+ // If the response data is empty, there is no payload to unmarshal.
+ if r.data == nil {
+ return nil
+ }
+
+ // TODO(gvisor.dev/issue/3698): Use the unsafe version once go_marshal is safe to use again.
+ m.UnmarshalBytes(r.data[hdrLen:])
+ return nil
+}
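
The futureResponse type above is a one-shot future: the device-FD reader goroutine fills in hdr and data, then closes ch, and resolve blocks on the channel until that happens. A minimal standalone sketch of the same pattern, with hypothetical names that are not part of this change:

package main

import "fmt"

// future is a one-shot future resolved by closing a channel, mirroring
// futureResponse above: the producer stores the result and closes ch;
// consumers block on ch before reading the result.
type future struct {
	ch     chan struct{}
	result string
}

func newFuture() *future {
	return &future{ch: make(chan struct{})}
}

// complete is called exactly once by the producer (the server-side reader
// in the FUSE case). Closing the channel wakes every waiter.
func (f *future) complete(result string) {
	f.result = result
	close(f.ch)
}

// resolve blocks until complete has been called. In the sentry this block
// goes through kernel.Task.Block so the task remains interruptible.
func (f *future) resolve() string {
	<-f.ch
	return f.result
}

func main() {
	f := newFuture()
	go f.complete("response payload")
	fmt.Println(f.resolve())
}
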
diff --git a/pkg/sentry/fsimpl/fuse/utils_test.go b/pkg/sentry/fsimpl/fuse/utils_test.go
new file mode 100644
index 000000000..e1d9e3365
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/utils_test.go
@@ -0,0 +1,132 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "io"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+func setup(t *testing.T) *testutil.System {
+ k, err := testutil.Boot()
+ if err != nil {
+ t.Fatalf("Error creating kernel: %v", err)
+ }
+
+ ctx := k.SupervisorContext()
+ creds := auth.CredentialsFromContext(ctx)
+
+ k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserList: true,
+ AllowUserMount: true,
+ })
+
+ mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
+ if err != nil {
+ t.Fatalf("NewMountNamespace(): %v", err)
+ }
+
+ return testutil.NewSystem(ctx, t, k.VFS(), mntns)
+}
+
+// newTestConnection creates a FUSE connection for the sentry to communicate
+// over, along with the device FD that the server side would use.
+func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*connection, *vfs.FileDescription, error) {
+ vfsObj := &vfs.VirtualFilesystem{}
+ fuseDev := &DeviceFD{}
+
+ if err := vfsObj.Init(system.Ctx); err != nil {
+ return nil, nil, err
+ }
+
+ vd := vfsObj.NewAnonVirtualDentry("genCountFD")
+ defer vd.DecRef(system.Ctx)
+ if err := fuseDev.vfsfd.Init(fuseDev, linux.O_RDWR|linux.O_CREAT, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{}); err != nil {
+ return nil, nil, err
+ }
+
+ fsopts := filesystemOptions{
+ maxActiveRequests: maxActiveRequests,
+ }
+ fs, err := newFUSEFilesystem(system.Ctx, 0, &fsopts, &fuseDev.vfsfd)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ return fs.conn, &fuseDev.vfsfd, nil
+}
+
+type testPayload struct {
+ marshal.StubMarshallable
+ data uint32
+}
+
+// SizeBytes implements marshal.Marshallable.SizeBytes.
+func (t *testPayload) SizeBytes() int {
+ return 4
+}
+
+// MarshalBytes implements marshal.Marshallable.MarshalBytes.
+func (t *testPayload) MarshalBytes(dst []byte) {
+ usermem.ByteOrder.PutUint32(dst[:4], t.data)
+}
+
+// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.
+func (t *testPayload) UnmarshalBytes(src []byte) {
+ *t = testPayload{data: usermem.ByteOrder.Uint32(src[:4])}
+}
+
+// Packed implements marshal.Marshallable.Packed.
+func (t *testPayload) Packed() bool {
+ return true
+}
+
+// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.
+func (t *testPayload) MarshalUnsafe(dst []byte) {
+ t.MarshalBytes(dst)
+}
+
+// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.
+func (t *testPayload) UnmarshalUnsafe(src []byte) {
+ t.UnmarshalBytes(src)
+}
+
+// CopyOutN implements marshal.Marshallable.CopyOutN.
+func (t *testPayload) CopyOutN(task marshal.CopyContext, addr usermem.Addr, limit int) (int, error) {
+ panic("not implemented")
+}
+
+// CopyOut implements marshal.Marshallable.CopyOut.
+func (t *testPayload) CopyOut(task marshal.CopyContext, addr usermem.Addr) (int, error) {
+ panic("not implemented")
+}
+
+// CopyIn implements marshal.Marshallable.CopyIn.
+func (t *testPayload) CopyIn(task marshal.CopyContext, addr usermem.Addr) (int, error) {
+ panic("not implemented")
+}
+
+// WriteTo implements io.WriterTo.WriteTo.
+func (t *testPayload) WriteTo(w io.Writer) (int64, error) {
+ panic("not implemented")
+}
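
testPayload above only fills in the byte-level marshalling methods and panics on the rest. The invariant those methods must satisfy is a simple round trip: UnmarshalBytes(MarshalBytes(x)) == x. A sketch of that check, using encoding/binary directly instead of the usermem.ByteOrder helper (an assumption for the sketch; usermem.ByteOrder is the host byte order):

package main

import (
	"encoding/binary"
	"fmt"
)

type payload struct {
	data uint32
}

// MarshalBytes and UnmarshalBytes must be exact inverses over a 4-byte
// buffer, which is what the FUSE tests rely on.
func (p *payload) MarshalBytes(dst []byte) {
	binary.LittleEndian.PutUint32(dst[:4], p.data)
}

func (p *payload) UnmarshalBytes(src []byte) {
	p.data = binary.LittleEndian.Uint32(src[:4])
}

func main() {
	in := payload{data: 0xdeadbeef}
	buf := make([]byte, 4)
	in.MarshalBytes(buf)

	var out payload
	out.UnmarshalBytes(buf)
	fmt.Println(in == out) // true
}
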
diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD
index f5f35a3bc..4c3e9acf8 100644
--- a/pkg/sentry/fsimpl/gofer/BUILD
+++ b/pkg/sentry/fsimpl/gofer/BUILD
@@ -38,6 +38,7 @@ go_library(
"host_named_pipe.go",
"p9file.go",
"regular_file.go",
+ "save_restore.go",
"socket.go",
"special_file.go",
"symlink.go",
@@ -52,8 +53,11 @@ go_library(
"//pkg/fspath",
"//pkg/log",
"//pkg/p9",
+ "//pkg/refs",
+ "//pkg/refsvfs2",
"//pkg/safemem",
"//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/fsimpl/host",
"//pkg/sentry/hostfd",
"//pkg/sentry/kernel",
@@ -68,6 +72,7 @@ go_library(
"//pkg/sentry/socket/unix/transport",
"//pkg/sentry/usage",
"//pkg/sentry/vfs",
+ "//pkg/sync",
"//pkg/syserr",
"//pkg/syserror",
"//pkg/unet",
@@ -84,5 +89,6 @@ go_test(
deps = [
"//pkg/p9",
"//pkg/sentry/contexttest",
+ "//pkg/sentry/pgalloc",
],
)
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index b98218753..e993c8e36 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -16,16 +16,17 @@ package gofer
import (
"fmt"
- "sync"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/p9"
+ "gvisor.dev/gvisor/pkg/refsvfs2"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -34,8 +35,11 @@ func (d *dentry) isDir() bool {
return d.fileType() == linux.S_IFDIR
}
-// Preconditions: filesystem.renameMu must be locked. d.dirMu must be locked.
-// d.isDir(). child must be a newly-created dentry that has never had a parent.
+// Preconditions:
+// * filesystem.renameMu must be locked.
+// * d.dirMu must be locked.
+// * d.isDir().
+// * child must be a newly-created dentry that has never had a parent.
func (d *dentry) cacheNewChildLocked(child *dentry, name string) {
d.IncRef() // reference held by child on its parent
child.parent = d
@@ -46,7 +50,9 @@ func (d *dentry) cacheNewChildLocked(child *dentry, name string) {
d.children[name] = child
}
-// Preconditions: d.dirMu must be locked. d.isDir().
+// Preconditions:
+// * d.dirMu must be locked.
+// * d.isDir().
func (d *dentry) cacheNegativeLookupLocked(name string) {
// Don't cache negative lookups if InteropModeShared is in effect (since
// this makes remote lookup unavoidable), or if d.isSynthetic() (in which
@@ -79,49 +85,54 @@ type createSyntheticOpts struct {
// createSyntheticChildLocked creates a synthetic file with the given name
// in d.
//
-// Preconditions: d.dirMu must be locked. d.isDir(). d does not already contain
-// a child with the given name.
+// Preconditions:
+// * d.dirMu must be locked.
+// * d.isDir().
+// * d does not already contain a child with the given name.
func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) {
- d2 := &dentry{
+ child := &dentry{
refs: 1, // held by d
fs: d.fs,
+ ino: d.fs.nextIno(),
mode: uint32(opts.mode),
uid: uint32(opts.kuid),
gid: uint32(opts.kgid),
blockSize: usermem.PageSize, // arbitrary
- handle: handle{
- fd: -1,
- },
- nlink: uint32(2),
+ hostFD: -1,
+ nlink: uint32(2),
+ }
+ if refsvfs2.LeakCheckEnabled() {
+ refsvfs2.Register(child, "gofer.dentry")
}
switch opts.mode.FileType() {
case linux.S_IFDIR:
// Nothing else needs to be done.
case linux.S_IFSOCK:
- d2.endpoint = opts.endpoint
+ child.endpoint = opts.endpoint
case linux.S_IFIFO:
- d2.pipe = opts.pipe
+ child.pipe = opts.pipe
default:
panic(fmt.Sprintf("failed to create synthetic file of unrecognized type: %v", opts.mode.FileType()))
}
- d2.pf.dentry = d2
- d2.vfsd.Init(d2)
+ child.pf.dentry = child
+ child.vfsd.Init(child)
- d.cacheNewChildLocked(d2, opts.name)
+ d.cacheNewChildLocked(child, opts.name)
d.syntheticChildren++
}
+// +stateify savable
type directoryFD struct {
fileDescription
vfs.DirectoryFileDescriptionDefaultImpl
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
off int64
dirents []vfs.Dirent
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *directoryFD) Release() {
+func (fd *directoryFD) Release(context.Context) {
}
// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
@@ -138,6 +149,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
fd.dirents = ds
}
+ d.InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent)
if d.cachedMetadataAuthoritative() {
d.touchAtime(fd.vfsfd.Mount())
}
@@ -151,7 +163,9 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
return nil
}
-// Preconditions: d.isDir(). There exists at least one directoryFD representing d.
+// Preconditions:
+// * d.isDir().
+// * There exists at least one directoryFD representing d.
func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
// NOTE(b/135560623): 9P2000.L's readdir does not specify behavior in the
// presence of concurrent mutation of an iterated directory, so
@@ -183,13 +197,13 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
{
Name: ".",
Type: linux.DT_DIR,
- Ino: d.ino,
+ Ino: uint64(d.ino),
NextOff: 1,
},
{
Name: "..",
Type: uint8(atomic.LoadUint32(&parent.mode) >> 12),
- Ino: parent.ino,
+ Ino: uint64(parent.ino),
NextOff: 2,
},
}
@@ -203,14 +217,14 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
off := uint64(0)
const count = 64 * 1024 // for consistency with the vfs1 client
d.handleMu.RLock()
- if !d.handleReadable {
+ if d.readFile.isNil() {
// This should not be possible because a readable handle should
// have been opened when the calling directoryFD was opened.
d.handleMu.RUnlock()
panic("gofer.dentry.getDirents called without a readable handle")
}
for {
- p9ds, err := d.handle.file.readdir(ctx, off, count)
+ p9ds, err := d.readFile.readdir(ctx, off, count)
if err != nil {
d.handleMu.RUnlock()
return nil, err
@@ -225,7 +239,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
}
dirent := vfs.Dirent{
Name: p9d.Name,
- Ino: p9d.QID.Path,
+ Ino: d.fs.inoFromQIDPath(p9d.QID.Path),
NextOff: int64(len(dirents) + 1),
}
// p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or
@@ -258,7 +272,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
dirents = append(dirents, vfs.Dirent{
Name: child.name,
Type: uint8(atomic.LoadUint32(&child.mode) >> 12),
- Ino: child.ino,
+ Ino: uint64(child.ino),
NextOff: int64(len(dirents) + 1),
})
}
@@ -299,3 +313,8 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in
return 0, syserror.EINVAL
}
}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *directoryFD) Sync(ctx context.Context) error {
+ return fd.dentry().syncRemoteFile(ctx)
+}
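
directoryFD above materializes the whole listing once (via getDirents) and then serves IterDirents from fd.off under fd.mu, so a callback that refuses an entry leaves the cursor in place and the next call resumes there. The shape of that loop as a standalone sketch, with hypothetical types that are not part of this change:

package main

import "fmt"

type dirent struct {
	name    string
	nextOff int64
}

// dirFD caches its directory listing and replays it from off, the way
// gofer.directoryFD.IterDirents walks fd.dirents[fd.off:].
type dirFD struct {
	off     int64
	dirents []dirent // lazily populated, then reused across calls
}

func (fd *dirFD) iterDirents(cb func(dirent) error) error {
	if fd.dirents == nil {
		// In the real code this is d.getDirents(ctx): one remote readdir
		// pass, plus any synthetic children.
		fd.dirents = []dirent{
			{".", 1}, {"..", 2}, {"a", 3}, {"b", 4},
		}
	}
	for fd.off < int64(len(fd.dirents)) {
		if err := cb(fd.dirents[fd.off]); err != nil {
			// The callback refusing an entry leaves fd.off pointing at
			// it, so the next call resumes from the same position.
			return err
		}
		fd.off++
	}
	return nil
}

func main() {
	fd := &dirFD{}
	_ = fd.iterDirents(func(d dirent) error {
		fmt.Println(d.name)
		return nil
	})
}
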
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 36e0e1856..baecb88c4 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -15,7 +15,9 @@
package gofer
import (
+ "math"
"sync"
+ "sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
@@ -33,7 +35,7 @@ import (
// Sync implements vfs.FilesystemImpl.Sync.
func (fs *filesystem) Sync(ctx context.Context) error {
- // Snapshot current syncable dentries and special files.
+ // Snapshot current syncable dentries and special file FDs.
fs.syncMu.Lock()
ds := make([]*dentry, 0, len(fs.syncableDentries))
for d := range fs.syncableDentries {
@@ -51,22 +53,28 @@ func (fs *filesystem) Sync(ctx context.Context) error {
// regardless.
var retErr error
- // Sync regular files.
+ // Sync syncable dentries.
for _, d := range ds {
- err := d.syncSharedHandle(ctx)
- d.DecRef()
- if err != nil && retErr == nil {
- retErr = err
+ err := d.syncCachedFile(ctx, true /* forFilesystemSync */)
+ d.DecRef(ctx)
+ if err != nil {
+ ctx.Infof("gofer.filesystem.Sync: dentry.syncCachedFile failed: %v", err)
+ if retErr == nil {
+ retErr = err
+ }
}
}
// Sync special files, which may be writable but do not use dentry shared
// handles (so they won't be synced by the above).
for _, sffd := range sffds {
- err := sffd.Sync(ctx)
- sffd.vfsfd.DecRef()
- if err != nil && retErr == nil {
- retErr = err
+ err := sffd.sync(ctx, true /* forFilesystemSync */)
+ sffd.vfsfd.DecRef(ctx)
+ if err != nil {
+ ctx.Infof("gofer.filesystem.Sync: specialFileFD.sync failed: %v", err)
+ if retErr == nil {
+ retErr = err
+ }
}
}
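
Both loops in Sync above share one error policy: attempt every sync, log each failure, and surface only the first error to the caller. As a standalone sketch of that policy:

package main

import (
	"errors"
	"fmt"
	"log"
)

// syncAll syncs every item, logs each failure, and returns the first
// error encountered, mirroring gofer.filesystem.Sync.
func syncAll(items []func() error) error {
	var retErr error
	for i, sync := range items {
		if err := sync(); err != nil {
			log.Printf("sync %d failed: %v", i, err)
			if retErr == nil {
				retErr = err
			}
		}
	}
	return retErr
}

func main() {
	err := syncAll([]func() error{
		func() error { return nil },
		func() error { return errors.New("first failure") },
		func() error { return errors.New("second failure") },
	})
	fmt.Println(err) // first failure
}
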
@@ -113,12 +121,15 @@ func putDentrySlice(ds *[]*dentry) {
// Dentries which may become cached as a result of the traversal are appended
// to *ds.
//
-// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
-// !rp.Done(). If !d.cachedMetadataAuthoritative(), then d's cached metadata
-// must be up to date.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * d.dirMu must be locked.
+// * !rp.Done().
+// * If !d.cachedMetadataAuthoritative(), then d's cached metadata must be up
+// to date.
//
// Postconditions: The returned dentry's cached metadata is up to date.
-func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
+func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
if !d.isDir() {
return nil, syserror.ENOTDIR
}
@@ -132,7 +143,7 @@ afterSymlink:
return d, nil
}
if name == ".." {
- if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil {
+ if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
return nil, err
} else if isRoot || d.parent == nil {
rp.Advance()
@@ -145,15 +156,13 @@ afterSymlink:
//
// Call rp.CheckMount() before updating d.parent's metadata, since if
// we traverse to another mount then d.parent's metadata is irrelevant.
- if err := rp.CheckMount(&d.parent.vfsd); err != nil {
+ if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
return nil, err
}
if d != d.parent && !d.cachedMetadataAuthoritative() {
- _, attrMask, attr, err := d.parent.file.getAttr(ctx, dentryAttrMask())
- if err != nil {
+ if err := d.parent.updateFromGetattr(ctx); err != nil {
return nil, err
}
- d.parent.updateFromP9Attrs(attrMask, &attr)
}
rp.Advance()
return d.parent, nil
@@ -165,10 +174,10 @@ afterSymlink:
if child == nil {
return nil, syserror.ENOENT
}
- if err := rp.CheckMount(&child.vfsd); err != nil {
+ if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
return nil, err
}
- if child.isSymlink() && rp.ShouldFollowSymlink() {
+ if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() {
target, err := child.readlink(ctx, rp.Mount())
if err != nil {
return nil, err
@@ -185,8 +194,11 @@ afterSymlink:
// getChildLocked returns a dentry representing the child of parent with the
// given name. If no such child exists, getChildLocked returns (nil, nil).
//
-// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
-// parent.isDir(). name is not "." or "..".
+// Preconditions:
+// * fs.renameMu must be locked.
+// * parent.dirMu must be locked.
+// * parent.isDir().
+// * name is not "." or "..".
//
// Postconditions: If getChildLocked returns a non-nil dentry, its cached
// metadata is up to date.
@@ -206,20 +218,31 @@ func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFil
return fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, ds)
}
-// Preconditions: As for getChildLocked. !parent.isSynthetic().
+// Preconditions: Same as getChildLocked, plus:
+// * !parent.isSynthetic().
func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) {
+ if child != nil {
+ // Need to lock child.metadataMu because we might be updating child
+ // metadata. We need to hold the lock *before* getting metadata from the
+ // server and release it after updating local metadata.
+ child.metadataMu.Lock()
+ }
qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name)
if err != nil && err != syserror.ENOENT {
+ if child != nil {
+ child.metadataMu.Unlock()
+ }
return nil, err
}
if child != nil {
- if !file.isNil() && qid.Path == child.ino {
- // The file at this path hasn't changed. Just update cached
- // metadata.
+ if !file.isNil() && qid.Path == child.qidPath {
+ // The file at this path hasn't changed. Just update cached metadata.
file.close(ctx)
- child.updateFromP9Attrs(attrMask, &attr)
+ child.updateFromP9AttrsLocked(attrMask, &attr)
+ child.metadataMu.Unlock()
return child, nil
}
+ child.metadataMu.Unlock()
if file.isNil() && child.isSynthetic() {
// We have a synthetic file, and no remote file has arisen to
// replace it.
@@ -230,7 +253,7 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
// has 0 references, drop it). Wait to update parent.children until we
// know what to replace the existing dentry with (i.e. one of the
// returns below), to avoid a redundant map access.
- vfsObj.InvalidateDentry(&child.vfsd)
+ vfsObj.InvalidateDentry(ctx, &child.vfsd)
if child.isSynthetic() {
// Normally we don't mark invalidated dentries as deleted since
// they may still exist (but at a different path), and also for
@@ -269,13 +292,15 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
// rp.Start().Impl().(*dentry)). It does not check that the returned directory
// is searchable by the provider of rp.
//
-// Preconditions: fs.renameMu must be locked. !rp.Done(). If
-// !d.cachedMetadataAuthoritative(), then d's cached metadata must be up to
-// date.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * !rp.Done().
+// * If !d.cachedMetadataAuthoritative(), then d's cached metadata must be up
+// to date.
func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
for !rp.Final() {
d.dirMu.Lock()
- next, err := fs.stepLocked(ctx, rp, d, ds)
+ next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
d.dirMu.Unlock()
if err != nil {
return nil, err
@@ -301,7 +326,7 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
}
for !rp.Done() {
d.dirMu.Lock()
- next, err := fs.stepLocked(ctx, rp, d, ds)
+ next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
d.dirMu.Unlock()
if err != nil {
return nil, err
@@ -318,12 +343,13 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
// createInRemoteDir (if the parent directory is a real remote directory) or
// createInSyntheticDir (if the parent directory is synthetic) to do so.
//
-// Preconditions: !rp.Done(). For the final path component in rp,
-// !rp.ShouldFollowSymlink().
-func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string) error, createInSyntheticDir func(parent *dentry, name string) error) error {
+// Preconditions:
+// * !rp.Done().
+// * For the final path component in rp, !rp.ShouldFollowSymlink().
+func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string, ds **[]*dentry) error, createInSyntheticDir func(parent *dentry, name string) error) error {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
start := rp.Start().Impl().(*dentry)
if !start.cachedMetadataAuthoritative() {
// Get updated metadata for start as required by
@@ -371,24 +397,40 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
}
parent.touchCMtime()
parent.dirents = nil
+ ev := linux.IN_CREATE
+ if dir {
+ ev |= linux.IN_ISDIR
+ }
+ parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
return nil
}
if fs.opts.interop == InteropModeShared {
- // The existence of a dentry at name would be inconclusive because the
- // file it represents may have been deleted from the remote filesystem,
- // so we would need to make an RPC to revalidate the dentry. Just
- // attempt the file creation RPC instead. If a file does exist, the RPC
- // will fail with EEXIST like we would have. If the RPC succeeds, and a
- // stale dentry exists, the dentry will fail revalidation next time
- // it's used.
- return createInRemoteDir(parent, name)
+ if child := parent.children[name]; child != nil && child.isSynthetic() {
+ return syserror.EEXIST
+ }
+ // The existence of a non-synthetic dentry at name would be inconclusive
+ // because the file it represents may have been deleted from the remote
+ // filesystem, so we would need to make an RPC to revalidate the dentry.
+ // Just attempt the file creation RPC instead. If a file does exist, the
+ // RPC will fail with EEXIST like we would have. If the RPC succeeds, and a
+ // stale dentry exists, the dentry will fail revalidation next time it's
+ // used.
+ if err := createInRemoteDir(parent, name, &ds); err != nil {
+ return err
+ }
+ ev := linux.IN_CREATE
+ if dir {
+ ev |= linux.IN_ISDIR
+ }
+ parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
+ return nil
}
if child := parent.children[name]; child != nil {
return syserror.EEXIST
}
// No cached dentry exists; however, there might still be an existing file
// at name. As above, we attempt the file creation RPC anyway.
- if err := createInRemoteDir(parent, name); err != nil {
+ if err := createInRemoteDir(parent, name, &ds); err != nil {
return err
}
if child, ok := parent.children[name]; ok && child == nil {
@@ -397,6 +439,11 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
}
parent.touchCMtime()
parent.dirents = nil
+ ev := linux.IN_CREATE
+ if dir {
+ ev |= linux.IN_ISDIR
+ }
+ parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
return nil
}
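
The creation-event mask is computed the same way at all three success paths of doCreateAt above: IN_CREATE, with IN_ISDIR or'd in for directories. A tiny standalone sketch of the rule (the helper is hypothetical; the constants match Linux's inotify ABI):

package main

import "fmt"

const (
	inCreate = 0x00000100 // linux.IN_CREATE
	inIsDir  = 0x40000000 // linux.IN_ISDIR
)

// createEvent returns the inotify mask for a newly created child, matching
// the ev computation in doCreateAt.
func createEvent(dir bool) uint32 {
	ev := uint32(inCreate)
	if dir {
		ev |= inIsDir
	}
	return ev
}

func main() {
	fmt.Printf("file: %#x dir: %#x\n", createEvent(false), createEvent(true))
}
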
@@ -404,7 +451,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool) error {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
start := rp.Start().Impl().(*dentry)
if !start.cachedMetadataAuthoritative() {
// Get updated metadata for start as required by
@@ -440,24 +487,64 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
}
vfsObj := rp.VirtualFilesystem()
mntns := vfs.MountNamespaceFromContext(ctx)
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
parent.dirMu.Lock()
defer parent.dirMu.Unlock()
+
child, ok := parent.children[name]
if ok && child == nil {
return syserror.ENOENT
}
- // We only need a dentry representing the file at name if it can be a mount
- // point. If child is nil, then it can't be a mount point. If child is
- // non-nil but stale, the actual file can't be a mount point either; we
- // detect this case by just speculatively calling PrepareDeleteDentry and
- // only revalidating the dentry if that fails (indicating that the existing
- // dentry is a mount point).
+
+ sticky := atomic.LoadUint32(&parent.mode)&linux.ModeSticky != 0
+ if sticky {
+ if !ok {
+ // If the sticky bit is set, we need to retrieve the child to determine
+ // whether removing it is allowed.
+ child, err = fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
+ if err != nil {
+ return err
+ }
+ } else if child != nil && !child.cachedMetadataAuthoritative() {
+ // Make sure the dentry representing the file at name is up to date
+ // before examining its metadata.
+ child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds)
+ if err != nil {
+ return err
+ }
+ }
+ if err := parent.mayDelete(rp.Credentials(), child); err != nil {
+ return err
+ }
+ }
+
+ // If a child dentry exists, prepare to delete it. This should fail if it is
+ // a mount point. We detect mount points by speculatively calling
+ // PrepareDeleteDentry, which fails if child is a mount point. However, we
+ // may need to revalidate the file in this case to make sure that it has not
+ // been deleted or replaced on the remote fs, in which case the mount point
+ // will have disappeared. If calling PrepareDeleteDentry fails again on the
+ // up-to-date dentry, we can be sure that it is a mount point.
+ //
+ // Also note that if child is nil, then it can't be a mount point.
if child != nil {
+ // Hold child.dirMu so we can check child.children and
+ // child.syntheticChildren. We don't access these fields until a bit later,
+ // but locking child.dirMu after calling vfs.PrepareDeleteDentry() would
+ // create an inconsistent lock ordering between dentry.dirMu and
+ // vfs.Dentry.mu (in the VFS lock order, it would make dentry.dirMu both "a
+ // FilesystemImpl lock" and "a lock acquired by a FilesystemImpl between
+ // PrepareDeleteDentry and CommitDeleteDentry"). To avoid this, lock
+ // child.dirMu before calling PrepareDeleteDentry.
child.dirMu.Lock()
defer child.dirMu.Unlock()
if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
- if parent.cachedMetadataAuthoritative() {
+ // We can skip revalidation in several cases:
+ // - We are not in InteropModeShared
+ // - The parent directory is synthetic, in which case the child must also
+ // be synthetic
+ // - We already updated the child during the sticky bit check above
+ if parent.cachedMetadataAuthoritative() || sticky {
return err
}
child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds)
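
The sticky-bit branch above has to fetch or revalidate the child because the permission rule depends on the child's owner: in a sticky directory, only the file's owner, the directory's owner, or a suitably capable task may unlink (this is what parent.mayDelete checks). A standalone sketch of the rule, with CAP_FOWNER handling deliberately omitted:

package main

import "fmt"

const modeSticky = 0o1000 // linux.ModeSticky

// mayDelete reports whether a task with credsUID may unlink a child owned
// by childUID from a directory with parentMode/parentUID, per the
// sticky-bit rule enforced above (capability checks omitted).
func mayDelete(credsUID, parentMode, parentUID, childUID uint32) bool {
	if parentMode&modeSticky == 0 {
		return true // only the usual write/exec checks apply
	}
	return credsUID == parentUID || credsUID == childUID
}

func main() {
	// A /tmp-style directory (mode 1777, owned by root): uid 1000 may
	// delete its own file but not uid 1001's.
	fmt.Println(mayDelete(1000, 0o1777, 0, 1000)) // true
	fmt.Println(mayDelete(1000, 0o1777, 0, 1001)) // false
}
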
@@ -518,7 +605,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
if child == nil {
return syserror.ENOENT
}
- } else {
+ } else if child == nil || !child.isSynthetic() {
err = parent.file.unlinkAt(ctx, name, flags)
if err != nil {
if child != nil {
@@ -527,8 +614,20 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
return err
}
}
+
+ // Generate inotify events for rmdir or unlink.
+ if dir {
+ parent.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */)
+ } else {
+ var cw *vfs.Watches
+ if child != nil {
+ cw = &child.watches
+ }
+ vfs.InotifyRemoveChild(ctx, cw, &parent.watches, name)
+ }
+
if child != nil {
- vfsObj.CommitDeleteDentry(&child.vfsd)
+ vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
child.setDeleted()
if child.isSynthetic() {
parent.syntheticChildren--
@@ -555,7 +654,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
// but dentry slices are allocated lazily, and it's much easier to say "defer
// fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() {
// fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this.
-func (fs *filesystem) renameMuRUnlockAndCheckCaching(ds **[]*dentry) {
+func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) {
fs.renameMu.RUnlock()
if *ds == nil {
return
@@ -563,20 +662,20 @@ func (fs *filesystem) renameMuRUnlockAndCheckCaching(ds **[]*dentry) {
if len(**ds) != 0 {
fs.renameMu.Lock()
for _, d := range **ds {
- d.checkCachingLocked()
+ d.checkCachingLocked(ctx)
}
fs.renameMu.Unlock()
}
putDentrySlice(*ds)
}
-func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) {
+func (fs *filesystem) renameMuUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) {
if *ds == nil {
fs.renameMu.Unlock()
return
}
for _, d := range **ds {
- d.checkCachingLocked()
+ d.checkCachingLocked(ctx)
}
fs.renameMu.Unlock()
putDentrySlice(*ds)
@@ -586,7 +685,7 @@ func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) {
func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return err
@@ -598,7 +697,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds
func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return nil, err
@@ -619,7 +718,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
start := rp.Start().Impl().(*dentry)
if !start.cachedMetadataAuthoritative() {
// Get updated metadata for start as required by
@@ -638,19 +737,40 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa
// LinkAt implements vfs.FilesystemImpl.LinkAt.
func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
- return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string) error {
+ return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, _ **[]*dentry) error {
if rp.Mount() != vd.Mount() {
return syserror.EXDEV
}
- // 9P2000.L supports hard links, but we don't.
- return syserror.EPERM
+ d := vd.Dentry().Impl().(*dentry)
+ if d.isDir() {
+ return syserror.EPERM
+ }
+ gid := auth.KGID(atomic.LoadUint32(&d.gid))
+ uid := auth.KUID(atomic.LoadUint32(&d.uid))
+ mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+ if err := vfs.MayLink(rp.Credentials(), mode, uid, gid); err != nil {
+ return err
+ }
+ if d.nlink == 0 {
+ return syserror.ENOENT
+ }
+ if d.nlink == math.MaxUint32 {
+ return syserror.EMLINK
+ }
+ if err := parent.file.link(ctx, d.file, childName); err != nil {
+ return err
+ }
+
+ // Success!
+ atomic.AddUint32(&d.nlink, 1)
+ return nil
}, nil)
}
// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
creds := rp.Credentials()
- return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error {
+ return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string, _ **[]*dentry) error {
if _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)); err != nil {
if !opts.ForSyntheticMountpoint || err == syserror.EEXIST {
return err
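
The LinkAt change earlier in this hunk adds the guards Linux applies before creating a hard link: a file whose link count has already dropped to zero is gone (ENOENT), and a saturated 32-bit counter must not wrap (EMLINK). A standalone sketch of that guard; the check-then-increment here is not atomic, matching the plain reads in the code above:

package main

import (
	"errors"
	"fmt"
	"math"
	"sync/atomic"
)

var (
	errNoEnt = errors.New("ENOENT")
	errMLink = errors.New("EMLINK")
)

// addLink mirrors the nlink checks in LinkAt: refuse to link a file that
// has already been unlinked, and refuse to overflow the 32-bit counter.
func addLink(nlink *uint32) error {
	n := atomic.LoadUint32(nlink)
	if n == 0 {
		return errNoEnt
	}
	if n == math.MaxUint32 {
		return errMLink
	}
	atomic.AddUint32(nlink, 1)
	return nil
}

func main() {
	var nlink uint32 = 1
	fmt.Println(addLink(&nlink), nlink) // <nil> 2
	nlink = 0
	fmt.Println(addLink(&nlink)) // ENOENT
}
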
@@ -685,34 +805,49 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
// MknodAt implements vfs.FilesystemImpl.MknodAt.
func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
- return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error {
+ return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, ds **[]*dentry) error {
creds := rp.Credentials()
_, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
- // If the gofer does not allow creating a socket or pipe, create a
- // synthetic one, i.e. one that is kept entirely in memory.
- if err == syserror.EPERM {
- switch opts.Mode.FileType() {
- case linux.S_IFSOCK:
- parent.createSyntheticChildLocked(&createSyntheticOpts{
- name: name,
- mode: opts.Mode,
- kuid: creds.EffectiveKUID,
- kgid: creds.EffectiveKGID,
- endpoint: opts.Endpoint,
- })
- return nil
- case linux.S_IFIFO:
- parent.createSyntheticChildLocked(&createSyntheticOpts{
- name: name,
- mode: opts.Mode,
- kuid: creds.EffectiveKUID,
- kgid: creds.EffectiveKGID,
- pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize),
- })
- return nil
- }
+ if err != syserror.EPERM {
+ return err
}
- return err
+
+ // EPERM means that the gofer does not allow creating a socket or pipe.
+ // Fall back to creating a synthetic one, i.e. one that is kept entirely
+ // in memory.
+
+ // Check that we're not overwriting an existing file with a synthetic one.
+ _, err = fs.stepLocked(ctx, rp, parent, true /* mayFollowSymlinks */, ds)
+ switch {
+ case err == nil:
+ // Step succeeded, another file exists.
+ return syserror.EEXIST
+ case err != syserror.ENOENT:
+ // Unexpected error.
+ return err
+ }
+
+ switch opts.Mode.FileType() {
+ case linux.S_IFSOCK:
+ parent.createSyntheticChildLocked(&createSyntheticOpts{
+ name: name,
+ mode: opts.Mode,
+ kuid: creds.EffectiveKUID,
+ kgid: creds.EffectiveKGID,
+ endpoint: opts.Endpoint,
+ })
+ return nil
+ case linux.S_IFIFO:
+ parent.createSyntheticChildLocked(&createSyntheticOpts{
+ name: name,
+ mode: opts.Mode,
+ kuid: creds.EffectiveKUID,
+ kgid: creds.EffectiveKGID,
+ pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize),
+ })
+ return nil
+ }
+ // Retain error from gofer if synthetic file cannot be created internally.
+ return syserror.EPERM
}, nil)
}
@@ -730,7 +865,14 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
+ unlocked := false
+ unlock := func() {
+ if !unlocked {
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ unlocked = true
+ }
+ }
+ defer unlock()
start := rp.Start().Impl().(*dentry)
if !start.cachedMetadataAuthoritative() {
@@ -740,7 +882,17 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
}
}
if rp.Done() {
- return start.openLocked(ctx, rp, &opts)
+ // Reject attempts to open mount root directory with O_CREAT.
+ if mayCreate && rp.MustBeDir() {
+ return nil, syserror.EISDIR
+ }
+ if mustCreate {
+ return nil, syserror.EEXIST
+ }
+ start.IncRef()
+ defer start.DecRef(ctx)
+ unlock()
+ return start.open(ctx, rp, &opts)
}
afterTrailingSymlink:
@@ -752,9 +904,13 @@ afterTrailingSymlink:
if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
return nil, err
}
+ // Reject attempts to open directories with O_CREAT.
+ if mayCreate && rp.MustBeDir() {
+ return nil, syserror.EISDIR
+ }
// Determine whether or not we need to create a file.
parent.dirMu.Lock()
- child, err := fs.stepLocked(ctx, rp, parent, &ds)
+ child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
if err == syserror.ENOENT && mayCreate {
if parent.isSynthetic() {
parent.dirMu.Unlock()
@@ -764,15 +920,14 @@ afterTrailingSymlink:
parent.dirMu.Unlock()
return fd, err
}
+ parent.dirMu.Unlock()
if err != nil {
- parent.dirMu.Unlock()
return nil, err
}
- // Open existing child or follow symlink.
- parent.dirMu.Unlock()
if mustCreate {
return nil, syserror.EEXIST
}
+ // Open existing child or follow symlink.
if child.isSymlink() && rp.ShouldFollowSymlink() {
target, err := child.readlink(ctx, rp.Mount())
if err != nil {
@@ -784,29 +939,48 @@ afterTrailingSymlink:
start = parent
goto afterTrailingSymlink
}
- return child.openLocked(ctx, rp, &opts)
+ if rp.MustBeDir() && !child.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ child.IncRef()
+ defer child.DecRef(ctx)
+ unlock()
+ return child.open(ctx, rp, &opts)
}
-// Preconditions: fs.renameMu must be locked.
-func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+// Preconditions: The caller must hold no locks (since opening pipes may block
+// indefinitely).
+func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
ats := vfs.AccessTypesForOpenFlags(opts)
if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
return nil, err
}
+
+ trunc := opts.Flags&linux.O_TRUNC != 0 && d.fileType() == linux.S_IFREG
+ if trunc {
+ // Lock metadataMu *while* we open a regular file with O_TRUNC because
+ // open(2) will change the file size on the server.
+ d.metadataMu.Lock()
+ defer d.metadataMu.Unlock()
+ }
+
+ var vfd *vfs.FileDescription
+ var err error
mnt := rp.Mount()
switch d.fileType() {
case linux.S_IFREG:
if !d.fs.opts.regularFilesUseSpecialFileFD {
- if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0); err != nil {
+ if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, trunc); err != nil {
return nil, err
}
fd := &regularFileFD{}
+ fd.LockFD.Init(&d.locks)
if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
AllowDirectIO: true,
}); err != nil {
return nil, err
}
- return &fd.vfsfd, nil
+ vfd = &fd.vfsfd
}
case linux.S_IFDIR:
// Can't open directories with O_CREAT.
@@ -826,6 +1000,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
}
}
fd := &directoryFD{}
+ fd.LockFD.Init(&d.locks)
if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
return nil, err
}
@@ -838,17 +1013,35 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
return nil, syserror.ENXIO
}
if d.fs.iopts.OpenSocketsByConnecting {
- return d.connectSocketLocked(ctx, opts)
+ return d.openSocketByConnecting(ctx, opts)
}
case linux.S_IFIFO:
if d.isSynthetic() {
- return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags)
+ return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags, &d.locks)
+ }
+ }
+
+ if vfd == nil {
+ if vfd, err = d.openSpecialFile(ctx, mnt, opts); err != nil {
+ return nil, err
}
}
- return d.openSpecialFileLocked(ctx, mnt, opts)
+
+ if trunc {
+ // If no errors occurred so far, then update the file size in memory. This
+ // step is required even if !d.cachedMetadataAuthoritative() because
+ // d.mappings has to be updated.
+ // d.metadataMu has already been acquired if trunc == true.
+ d.updateSizeLocked(0)
+
+ if d.cachedMetadataAuthoritative() {
+ d.touchCMtimeLocked()
+ }
+ }
+ return vfd, err
}
-func (d *dentry) connectSocketLocked(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (d *dentry) openSocketByConnecting(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
if opts.Flags&linux.O_DIRECT != 0 {
return nil, syserror.EINVAL
}
@@ -868,7 +1061,7 @@ func (d *dentry) connectSocketLocked(ctx context.Context, opts *vfs.OpenOptions)
return fd, nil
}
-func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (d *dentry) openSpecialFile(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
ats := vfs.AccessTypesForOpenFlags(opts)
if opts.Flags&linux.O_DIRECT != 0 {
return nil, syserror.EINVAL
@@ -902,7 +1095,7 @@ retry:
return nil, err
}
}
- fd, err := newSpecialFileFD(h, mnt, d, opts.Flags)
+ fd, err := newSpecialFileFD(h, mnt, d, &d.locks, opts.Flags)
if err != nil {
h.close(ctx)
return nil, err
@@ -910,8 +1103,10 @@ retry:
return &fd.vfsfd, nil
}
-// Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked.
-// !d.isSynthetic().
+// Preconditions:
+// * d.fs.renameMu must be locked.
+// * d.dirMu must be locked.
+// * !d.isSynthetic().
func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) {
if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
return nil, err
@@ -934,10 +1129,8 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
}
creds := rp.Credentials()
name := rp.Component()
- // Filter file creation flags and O_LARGEFILE out; the create RPC already
- // has the semantics of O_CREAT|O_EXCL, while some servers will choke on
- // O_LARGEFILE.
- createFlags := p9.OpenFlags(opts.Flags &^ (linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_LARGEFILE))
+ // We only want the access mode for creating the file.
+ createFlags := p9.OpenFlags(opts.Flags) & p9.OpenFlagsModeMask
fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
if err != nil {
dirfile.close(ctx)
@@ -970,12 +1163,18 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD
if useRegularFileFD {
child.handleMu.Lock()
- child.handle.file = openFile
- if fdobj != nil {
- child.handle.fd = int32(fdobj.Release())
+ if vfs.MayReadFileWithOpenFlags(opts.Flags) {
+ child.readFile = openFile
+ if fdobj != nil {
+ child.hostFD = int32(fdobj.Release())
+ }
+ } else if fdobj != nil {
+ // Can't use fdobj if it's not readable.
+ fdobj.Close()
+ }
+ if vfs.MayWriteFileWithOpenFlags(opts.Flags) {
+ child.writeFile = openFile
}
- child.handleReadable = vfs.MayReadFileWithOpenFlags(opts.Flags)
- child.handleWritable = vfs.MayWriteFileWithOpenFlags(opts.Flags)
child.handleMu.Unlock()
}
// Insert the dentry into the tree.
@@ -989,6 +1188,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
var childVFSFD *vfs.FileDescription
if useRegularFileFD {
fd := &regularFileFD{}
+ fd.LockFD.Init(&child.locks)
if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{
AllowDirectIO: true,
}); err != nil {
@@ -1003,13 +1203,14 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
if fdobj != nil {
h.fd = int32(fdobj.Release())
}
- fd, err := newSpecialFileFD(h, mnt, child, opts.Flags)
+ fd, err := newSpecialFileFD(h, mnt, child, &d.locks, opts.Flags)
if err != nil {
h.close(ctx)
return nil, err
}
childVFSFD = &fd.vfsfd
}
+ d.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */)
return childVFSFD, nil
}
@@ -1017,7 +1218,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return "", err
@@ -1037,7 +1238,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
var ds *[]*dentry
fs.renameMu.Lock()
- defer fs.renameMuUnlockAndCheckCaching(&ds)
+ defer fs.renameMuUnlockAndCheckCaching(ctx, &ds)
newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds)
if err != nil {
return err
@@ -1061,7 +1262,8 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
return err
}
}
- if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+ creds := rp.Credentials()
+ if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
return err
}
vfsObj := rp.VirtualFilesystem()
@@ -1076,12 +1278,15 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
if renamed == nil {
return syserror.ENOENT
}
+ if err := oldParent.mayDelete(creds, renamed); err != nil {
+ return err
+ }
if renamed.isDir() {
if renamed == newParent || genericIsAncestorDentry(renamed, newParent) {
return syserror.EINVAL
}
if oldParent != newParent {
- if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+ if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil {
return err
}
}
@@ -1092,7 +1297,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
}
if oldParent != newParent {
- if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+ if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
return err
}
newParent.dirMu.Lock()
@@ -1112,6 +1317,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
if !renamed.isDir() {
return syserror.EISDIR
}
+ if genericIsAncestorDentry(replaced, renamed) {
+ return syserror.ENOTEMPTY
+ }
} else {
if rp.MustBeDir() || renamed.isDir() {
return syserror.ENOTDIR
@@ -1123,7 +1331,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
return nil
}
mntns := vfs.MountNamespaceFromContext(ctx)
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
return err
}
@@ -1148,7 +1356,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
}
// Update the dentry tree.
- vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD)
+ vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD)
if replaced != nil {
replaced.setDeleted()
if replaced.isSynthetic() {
@@ -1162,14 +1370,15 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
// with reference counts and queue oldParent for checkCachingLocked if the
// parent isn't actually changing.
if oldParent != newParent {
+ oldParent.decRefLocked()
ds = appendDentry(ds, oldParent)
newParent.IncRef()
if renamed.isSynthetic() {
oldParent.syntheticChildren--
newParent.syntheticChildren++
}
+ renamed.parent = newParent
}
- renamed.parent = newParent
renamed.name = newName
if newParent.children == nil {
newParent.children = make(map[string]*dentry)
@@ -1190,10 +1399,12 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
if newParent.cachedMetadataAuthoritative() {
newParent.dirents = nil
newParent.touchCMtime()
- if renamed.isDir() {
+ if renamed.isDir() && (replaced == nil || !replaced.isDir()) {
+ // Increase the link count if we did not replace another directory.
newParent.incLinks()
}
}
+ vfs.InotifyRename(ctx, &renamed.watches, &oldParent.watches, &newParent.watches, oldName, newName, renamed.isDir())
return nil
}
@@ -1206,19 +1417,28 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
return err
}
- return d.setStat(ctx, rp.Credentials(), &opts.Stat, rp.Mount())
+ err = d.setStat(ctx, rp.Credentials(), &opts, rp.Mount())
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ if err != nil {
+ return err
+ }
+
+ if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+ d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
+ }
+ return nil
}
// StatAt implements vfs.FilesystemImpl.StatAt.
func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return linux.Statx{}, err
@@ -1235,7 +1455,7 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return linux.Statfs{}, err
@@ -1269,7 +1489,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
- return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error {
+ return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, _ **[]*dentry) error {
creds := rp.Credentials()
_, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
return err
@@ -1281,11 +1501,11 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
return fs.unlinkAt(ctx, rp, false /* dir */)
}
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return nil, err
@@ -1298,61 +1518,76 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
d.IncRef()
return &endpoint{
dentry: d,
- file: d.file.file,
path: opts.Addr,
}, nil
}
- return d.endpoint, nil
+ if d.endpoint != nil {
+ return d.endpoint, nil
+ }
}
return nil, syserror.ECONNREFUSED
}
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return nil, err
}
- return d.listxattr(ctx, rp.Credentials(), size)
+ return d.listXattr(ctx, rp.Credentials(), size)
}
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return "", err
}
- return d.getxattr(ctx, rp.Credentials(), &opts)
+ return d.getXattr(ctx, rp.Credentials(), &opts)
}
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ return err
+ }
+ err = d.setXattr(ctx, rp.Credentials(), &opts)
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ if err != nil {
return err
}
- return d.setxattr(ctx, rp.Credentials(), &opts)
+
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ return nil
}
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
return err
}
- return d.removexattr(ctx, rp.Credentials(), name)
+ err = d.removeXattr(ctx, rp.Credentials(), name)
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ if err != nil {
+ return err
+ }
+
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ return nil
}
// PrependPath implements vfs.FilesystemImpl.PrependPath.
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 3f3bd56f0..80668ebc1 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -26,6 +26,9 @@
// *** "memmap.Mappable locks taken by Translate" below this point
// dentry.handleMu
// dentry.dataMu
+// filesystem.inoMu
+// specialFileFD.mu
+// specialFileFD.bufMu
//
// Locking dentry.dirMu in multiple dentries requires that either ancestor
// dentries are locked before descendant dentries, or that filesystem.renameMu
@@ -36,7 +39,6 @@ import (
"fmt"
"strconv"
"strings"
- "sync"
"sync/atomic"
"syscall"
@@ -44,7 +46,10 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/p9"
+ refs_vfs1 "gvisor.dev/gvisor/pkg/refs"
+ "gvisor.dev/gvisor/pkg/refsvfs2"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -52,6 +57,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/unet"
"gvisor.dev/gvisor/pkg/usermem"
@@ -61,9 +67,13 @@ import (
const Name = "9p"
// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type FilesystemType struct{}
// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
type filesystem struct {
vfsfs vfs.Filesystem
@@ -76,7 +86,7 @@ type filesystem struct {
iopts InternalFilesystemOptions
// client is the client used by this filesystem. client is immutable.
- client *p9.Client
+ client *p9.Client `state:"nosave"`
// clock is a realtime clock used to set timestamps in file operations.
clock ktime.Clock
@@ -84,6 +94,9 @@ type filesystem struct {
// devMinor is the filesystem's minor device number. devMinor is immutable.
devMinor uint32
+ // root is the root dentry. root is immutable.
+ root *dentry
+
// renameMu serves two purposes:
//
// - It synchronizes path resolution with renaming initiated by this
@@ -94,23 +107,42 @@ type filesystem struct {
// reference count (such that it is usable as vfs.ResolvingPath.Start() or
// is reachable from its children), or if it is a child dentry (such that
// it is reachable from its parent).
- renameMu sync.RWMutex
+ renameMu sync.RWMutex `state:"nosave"`
// cachedDentries contains all dentries with 0 references. (Due to race
// conditions, it may also contain dentries with non-zero references.)
- // cachedDentriesLen is the number of dentries in cachedDentries. These
- // fields are protected by renameMu.
+ // cachedDentriesLen is the number of dentries in cachedDentries. These fields
+ // are protected by renameMu.
cachedDentries dentryList
cachedDentriesLen uint64
- // syncableDentries contains all dentries in this filesystem for which
- // !dentry.file.isNil(). specialFileFDs contains all open specialFileFDs.
- // These fields are protected by syncMu.
- syncMu sync.Mutex
+ // syncableDentries contains all non-synthetic dentries. specialFileFDs
+ // contains all open specialFileFDs. These fields are protected by syncMu.
+ syncMu sync.Mutex `state:"nosave"`
syncableDentries map[*dentry]struct{}
specialFileFDs map[*specialFileFD]struct{}
+
+ // inoByQIDPath maps previously-observed QID.Paths to inode numbers
+ // assigned to those paths. inoByQIDPath is not preserved across
+ // checkpoint/restore because QIDs may be reused between different gofer
+ // processes, so QIDs may be repeated for different files across
+ // checkpoint/restore. inoByQIDPath is protected by inoMu.
+ inoMu sync.Mutex `state:"nosave"`
+ inoByQIDPath map[uint64]uint64 `state:"nosave"`
+
+ // lastIno is the last inode number assigned to a file. lastIno is accessed
+ // using atomic memory operations.
+ lastIno uint64
+
+ // savedDentryRW records open read/write handles during save/restore.
+ savedDentryRW map[*dentry]savedDentryRW
+
+ // released is nonzero once filesystem.Release has been called. It is accessed
+ // with atomic memory operations.
+ released int32
}
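The state:"nosave" tags above follow gvisor's pkg/state conventions: tagged fields are skipped at checkpoint and must hold a usable value after restore, which is why inoByQIDPath is documented as intentionally discarded. A hedged sketch of the idiom (afterLoad is the pkg/state restore hook; the type and field names here are illustrative, not from this diff):

    // +stateify savable
    type cacheOwner struct {
        mu    sync.Mutex        `state:"nosave"` // zero-value mutex is usable after restore
        byKey map[uint64]uint64 `state:"nosave"` // rebuilt on restore, as for inoByQIDPath
    }

    // afterLoad runs after restore; nosave maps come back nil and must be remade.
    func (c *cacheOwner) afterLoad() {
        c.byKey = make(map[uint64]uint64)
    }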
+// +stateify savable
type filesystemOptions struct {
// "Standard" 9P options.
fd int
@@ -121,8 +153,7 @@ type filesystemOptions struct {
msize uint32
version string
- // maxCachedDentries is the maximum number of dentries with 0 references
- // retained by the client.
+ // maxCachedDentries is the maximum size of filesystem.cachedDentries.
maxCachedDentries uint64
// If forcePageCache is true, host FDs may not be used for application
@@ -156,6 +187,8 @@ type filesystemOptions struct {
// InteropMode controls the client's interaction with other remote filesystem
// users.
+//
+// +stateify savable
type InteropMode uint32
const (
@@ -171,10 +204,10 @@ const (
//
// - File timestamps are based on client clocks. This ensures that users of
// the client observe timestamps that are coherent with their own clocks
- // and consistent with Linux's semantics. However, since it is not always
- // possible for clients to set arbitrary atimes and mtimes, and never
- // possible for clients to set arbitrary ctimes, file timestamp changes are
- // stored in the client only and never sent to the remote filesystem.
+ // and consistent with Linux's semantics (in particular, it is not always
+ // possible for clients to set arbitrary atimes and mtimes depending on the
+ // remote filesystem implementation, and never possible for clients to set
+ // arbitrary ctimes).
InteropModeExclusive InteropMode = iota
// InteropModeWritethrough is appropriate when there are read-only users of
@@ -214,7 +247,13 @@ const (
// InternalFilesystemOptions may be passed as
// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
+//
+// +stateify savable
type InternalFilesystemOptions struct {
+ // If UniqueID is non-empty, it is an opaque string used to reassociate the
+ // filesystem with a new server FD during restoration from checkpoint.
+ UniqueID string
+
// If LeakConnection is true, do not close the connection to the server
// when the Filesystem is released. This is necessary for deployments in
// which servers can handle only a single client and report failure if that
@@ -240,6 +279,9 @@ func (FilesystemType) Name() string {
return Name
}
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
mfp := pgalloc.MemoryFileProviderFromContext(ctx)
@@ -251,46 +293,11 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
mopts := vfs.GenericParseMountOptions(opts.Data)
var fsopts filesystemOptions
- // Check that the transport is "fd".
- trans, ok := mopts["trans"]
- if !ok {
- ctx.Warningf("gofer.FilesystemType.GetFilesystem: transport must be specified as 'trans=fd'")
- return nil, nil, syserror.EINVAL
- }
- delete(mopts, "trans")
- if trans != "fd" {
- ctx.Warningf("gofer.FilesystemType.GetFilesystem: unsupported transport: trans=%s", trans)
- return nil, nil, syserror.EINVAL
- }
-
- // Check that read and write FDs are provided and identical.
- rfdstr, ok := mopts["rfdno"]
- if !ok {
- ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD must be specified as 'rfdno=<file descriptor>")
- return nil, nil, syserror.EINVAL
- }
- delete(mopts, "rfdno")
- rfd, err := strconv.Atoi(rfdstr)
+ fd, err := getFDFromMountOptionsMap(ctx, mopts)
if err != nil {
- ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid read FD: rfdno=%s", rfdstr)
- return nil, nil, syserror.EINVAL
- }
- wfdstr, ok := mopts["wfdno"]
- if !ok {
- ctx.Warningf("gofer.FilesystemType.GetFilesystem: write FD must be specified as 'wfdno=<file descriptor>")
- return nil, nil, syserror.EINVAL
- }
- delete(mopts, "wfdno")
- wfd, err := strconv.Atoi(wfdstr)
- if err != nil {
- ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid write FD: wfdno=%s", wfdstr)
- return nil, nil, syserror.EINVAL
- }
- if rfd != wfd {
- ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD (%d) and write FD (%d) must be equal", rfd, wfd)
- return nil, nil, syserror.EINVAL
+ return nil, nil, err
}
- fsopts.fd = rfd
+ fsopts.fd = fd
// Get the attach name.
fsopts.aname = "/"
@@ -406,85 +413,133 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
}
// If !ok, iopts being the zero value is correct.
- // Establish a connection with the server.
- conn, err := unet.NewSocket(fsopts.fd)
+ // Construct the filesystem object.
+ devMinor, err := vfsObj.GetAnonBlockDevMinor()
if err != nil {
return nil, nil, err
}
+ fs := &filesystem{
+ mfp: mfp,
+ opts: fsopts,
+ iopts: iopts,
+ clock: ktime.RealtimeClockFromContext(ctx),
+ devMinor: devMinor,
+ syncableDentries: make(map[*dentry]struct{}),
+ specialFileFDs: make(map[*specialFileFD]struct{}),
+ inoByQIDPath: make(map[uint64]uint64),
+ }
+ fs.vfsfs.Init(vfsObj, &fstype, fs)
- // Perform version negotiation with the server.
- ctx.UninterruptibleSleepStart(false)
- client, err := p9.NewClient(conn, fsopts.msize, fsopts.version)
- ctx.UninterruptibleSleepFinish(false)
- if err != nil {
- conn.Close()
+ // Connect to the server.
+ if err := fs.dial(ctx); err != nil {
return nil, nil, err
}
- // Ownership of conn has been transferred to client.
// Perform attach to obtain the filesystem root.
ctx.UninterruptibleSleepStart(false)
- attached, err := client.Attach(fsopts.aname)
+ attached, err := fs.client.Attach(fsopts.aname)
ctx.UninterruptibleSleepFinish(false)
if err != nil {
- client.Close()
+ fs.vfsfs.DecRef(ctx)
return nil, nil, err
}
attachFile := p9file{attached}
qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask())
if err != nil {
attachFile.close(ctx)
- client.Close()
- return nil, nil, err
- }
-
- // Construct the filesystem object.
- devMinor, err := vfsObj.GetAnonBlockDevMinor()
- if err != nil {
- attachFile.close(ctx)
- client.Close()
+ fs.vfsfs.DecRef(ctx)
return nil, nil, err
}
- fs := &filesystem{
- mfp: mfp,
- opts: fsopts,
- iopts: iopts,
- client: client,
- clock: ktime.RealtimeClockFromContext(ctx),
- devMinor: devMinor,
- syncableDentries: make(map[*dentry]struct{}),
- specialFileFDs: make(map[*specialFileFD]struct{}),
- }
- fs.vfsfs.Init(vfsObj, &fstype, fs)
// Construct the root dentry.
root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr)
if err != nil {
attachFile.close(ctx)
- fs.vfsfs.DecRef()
+ fs.vfsfs.DecRef(ctx)
return nil, nil, err
}
// Set the root's reference count to 2. One reference is returned to the
- // caller, and the other is deliberately leaked to prevent the root from
- // being "cached" and subsequently evicted. Its resources will still be
- // cleaned up by fs.Release().
+ // caller, and the other is held by fs to prevent the root from being "cached"
+ // and subsequently evicted.
root.refs = 2
+ fs.root = root
return &fs.vfsfs, &root.vfsd, nil
}
+func getFDFromMountOptionsMap(ctx context.Context, mopts map[string]string) (int, error) {
+ // Check that the transport is "fd".
+ trans, ok := mopts["trans"]
+ if !ok || trans != "fd" {
+ ctx.Warningf("gofer.getFDFromMountOptionsMap: transport must be specified as 'trans=fd'")
+ return -1, syserror.EINVAL
+ }
+ delete(mopts, "trans")
+
+ // Check that read and write FDs are provided and identical.
+ rfdstr, ok := mopts["rfdno"]
+ if !ok {
+ ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD must be specified as 'rfdno=<file descriptor>'")
+ return -1, syserror.EINVAL
+ }
+ delete(mopts, "rfdno")
+ rfd, err := strconv.Atoi(rfdstr)
+ if err != nil {
+ ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid read FD: rfdno=%s", rfdstr)
+ return -1, syserror.EINVAL
+ }
+ wfdstr, ok := mopts["wfdno"]
+ if !ok {
+ ctx.Warningf("gofer.getFDFromMountOptionsMap: write FD must be specified as 'wfdno=<file descriptor>'")
+ return -1, syserror.EINVAL
+ }
+ delete(mopts, "wfdno")
+ wfd, err := strconv.Atoi(wfdstr)
+ if err != nil {
+ ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid write FD: wfdno=%s", wfdstr)
+ return -1, syserror.EINVAL
+ }
+ if rfd != wfd {
+ ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD (%d) and write FD (%d) must be equal", rfd, wfd)
+ return -1, syserror.EINVAL
+ }
+ return rfd, nil
+}
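As a usage sketch, the keys consumed by getFDFromMountOptionsMap arrive via a mount data string parsed by vfs.GenericParseMountOptions; the FD number 7 below is purely illustrative:

    // Hypothetical gofer mount data; rfdno and wfdno must name the same FD.
    data := "trans=fd,rfdno=7,wfdno=7"
    mopts := map[string]string{}
    for _, kv := range strings.Split(data, ",") {
        if parts := strings.SplitN(kv, "=", 2); len(parts) == 2 {
            mopts[parts[0]] = parts[1]
        }
    }
    // getFDFromMountOptionsMap(ctx, mopts) now returns 7 and deletes the
    // consumed keys, leaving any remaining options for later validation.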
+
+// Preconditions: fs.client == nil.
+func (fs *filesystem) dial(ctx context.Context) error {
+ // Establish a connection with the server.
+ conn, err := unet.NewSocket(fs.opts.fd)
+ if err != nil {
+ return err
+ }
+
+ // Perform version negotiation with the server.
+ ctx.UninterruptibleSleepStart(false)
+ client, err := p9.NewClient(conn, fs.opts.msize, fs.opts.version)
+ ctx.UninterruptibleSleepFinish(false)
+ if err != nil {
+ conn.Close()
+ return err
+ }
+ // Ownership of conn has been transferred to client.
+
+ fs.client = client
+ return nil
+}
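dial assumes fs.opts.fd is one end of an already-connected socket; the runner typically creates the pair and hands the other end to the gofer process. A hedged sketch of that setup (the variable names are illustrative):

    // One end goes to the gofer server; the other becomes the rfdno/wfdno
    // mount option consumed by getFDFromMountOptionsMap above.
    fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0)
    if err != nil {
        return err
    }
    serverFD, clientFD := fds[0], fds[1]
    _, _ = serverFD, clientFD // e.g. fmt.Sprintf("trans=fd,rfdno=%d,wfdno=%d", clientFD, clientFD)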
+
// Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {
- ctx := context.Background()
- mf := fs.mfp.MemoryFile()
+func (fs *filesystem) Release(ctx context.Context) {
+ atomic.StoreInt32(&fs.released, 1)
+ mf := fs.mfp.MemoryFile()
fs.syncMu.Lock()
for d := range fs.syncableDentries {
d.handleMu.Lock()
d.dataMu.Lock()
- if d.handleWritable {
+ if h := d.writeHandleLocked(); h.isOpen() {
// Write dirty cached data to the remote file.
- if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt); err != nil {
+ if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err)
}
// TODO(jamieliu): Do we need to flushf/fsync d?
@@ -494,9 +549,9 @@ func (fs *filesystem) Release() {
d.dirty.RemoveAll()
d.dataMu.Unlock()
// Close the host fd if one exists.
- if d.handle.fd >= 0 {
- syscall.Close(int(d.handle.fd))
- d.handle.fd = -1
+ if d.hostFD >= 0 {
+ syscall.Close(int(d.hostFD))
+ d.hostFD = -1
}
d.handleMu.Unlock()
}
@@ -505,6 +560,21 @@ func (fs *filesystem) Release() {
// fs.
fs.syncMu.Unlock()
+ // If leak checking is enabled, release all outstanding references in the
+ // filesystem. We deliberately avoid doing this outside of leak checking; we
+ // have released all external resources above rather than relying on dentry
+ // destructors.
+ if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking {
+ fs.renameMu.Lock()
+ fs.root.releaseSyntheticRecursiveLocked(ctx)
+ fs.evictAllCachedDentriesLocked(ctx)
+ fs.renameMu.Unlock()
+
+ // An extra reference was held by the filesystem on the root to prevent it from
+ // being cached/evicted.
+ fs.root.DecRef(ctx)
+ }
+
if !fs.iopts.LeakConnection {
// Close the connection to the server. This implicitly clunks all fids.
fs.client.Close()
@@ -513,7 +583,34 @@ func (fs *filesystem) Release() {
fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
}
+// releaseSyntheticRecursiveLocked traverses the tree with root d and decrements
+// the reference count on every synthetic dentry. Synthetic dentries have one
+// reference for existence that should be dropped during filesystem.Release.
+//
+// Precondition: d.fs.renameMu is locked.
+func (d *dentry) releaseSyntheticRecursiveLocked(ctx context.Context) {
+ if d.isSynthetic() {
+ d.decRefLocked()
+ d.checkCachingLocked(ctx)
+ }
+ if d.isDir() {
+ var children []*dentry
+ d.dirMu.Lock()
+ for _, child := range d.children {
+ children = append(children, child)
+ }
+ d.dirMu.Unlock()
+ for _, child := range children {
+ if child != nil {
+ child.releaseSyntheticRecursiveLocked(ctx)
+ }
+ }
+ }
+}
+
// dentry implements vfs.DentryImpl.
+//
+// +stateify savable
type dentry struct {
vfsd vfs.Dentry
@@ -538,14 +635,15 @@ type dentry struct {
// filesystem.renameMu.
name string
- // We don't support hard links, so each dentry maps 1:1 to an inode.
+ // qidPath is the p9.QID.Path for this file. qidPath is immutable.
+ qidPath uint64
// file is the unopened p9.File that backs this dentry. file is immutable.
//
// If file.isNil(), this dentry represents a synthetic file, i.e. a file
// that does not exist on the remote filesystem. As of this writing, the
// only files that can be synthetic are sockets, pipes, and directories.
- file p9file
+ file p9file `state:"nosave"`
// If deleted is non-zero, the file represented by this dentry has been
// deleted. deleted is accessed using atomic memory operations.
@@ -557,7 +655,7 @@ type dentry struct {
cached bool
dentryEntry
- dirMu sync.Mutex
+ dirMu sync.Mutex `state:"nosave"`
// If this dentry represents a directory, children contains:
//
@@ -581,60 +679,75 @@ type dentry struct {
// returned by the server. dirents is protected by dirMu.
dirents []vfs.Dirent
- // Cached metadata; protected by metadataMu and accessed using atomic
- // memory operations unless otherwise specified.
- metadataMu sync.Mutex
- ino uint64 // immutable
- mode uint32 // type is immutable, perms are mutable
- uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
- gid uint32 // auth.KGID, but ...
- blockSize uint32 // 0 if unknown
+ // Cached metadata; protected by metadataMu.
+ // To access:
+ // - In situations where consistency is not required (like stat), these
+ // can be accessed using atomic operations only (without locking).
+ // - Lock metadataMu, then access without atomic operations.
+ // To mutate:
+ // - Lock metadataMu and use atomic operations to update because we might
+ // have atomic readers that don't hold the lock.
+ metadataMu sync.Mutex `state:"nosave"`
+ ino uint64 // immutable
+ mode uint32 // type is immutable, perms are mutable
+ uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
+ gid uint32 // auth.KGID, but ...
+ blockSize uint32 // 0 if unknown
// Timestamps, all nsecs from the Unix epoch.
atime int64
mtime int64
ctime int64
btime int64
- // File size, protected by both metadataMu and dataMu (i.e. both must be
- // locked to mutate it).
+ // File size, which differs from other metadata in two ways:
+ //
+ // - We make a best-effort attempt to keep it up to date even if
+ // !dentry.cachedMetadataAuthoritative() for the sake of O_APPEND writes.
+ //
+ // - size is protected by both metadataMu and dataMu (i.e. both must be
+ // locked to mutate it; locking either is sufficient to access it).
size uint64
+ // If this dentry does not represent a synthetic file, deleted is 0, and
+ // atimeDirty/mtimeDirty are non-zero, atime/mtime may have diverged from the
+ // remote file's timestamps, which should be updated when this dentry is
+ // evicted.
+ atimeDirty uint32
+ mtimeDirty uint32
// nlink counts the number of hard links to this dentry. It's updated and
// accessed using atomic operations. It's not protected by metadataMu like the
// other metadata fields.
nlink uint32
- mapsMu sync.Mutex
+ mapsMu sync.Mutex `state:"nosave"`
// If this dentry represents a regular file, mappings tracks mappings of
// the file into memmap.MappingSpaces. mappings is protected by mapsMu.
mappings memmap.MappingSet
- // If this dentry represents a regular file or directory:
+ // - If this dentry represents a regular file or directory, readFile is the
+ // p9.File used for reads by all regularFileFDs/directoryFDs representing
+ // this dentry.
//
- // - handle is the I/O handle used by all regularFileFDs/directoryFDs
- // representing this dentry.
+ // - If this dentry represents a regular file, writeFile is the p9.File
+ // used for writes by all regularFileFDs representing this dentry.
//
- // - handleReadable is true if handle is readable.
- //
- // - handleWritable is true if handle is writable.
- //
- // Invariants:
- //
- // - If handleReadable == handleWritable == false, then handle.file == nil
- // (i.e. there is no open handle). Conversely, if handleReadable ||
- // handleWritable == true, then handle.file != nil (i.e. there is an open
- // handle).
- //
- // - handleReadable and handleWritable cannot transition from true to false
- // (i.e. handles may not be downgraded).
+ // - If this dentry represents a regular file, hostFD is the host FD used
+ // for memory mappings and I/O (when applicable) in preference to readFile
+ // and writeFile. hostFD is always readable; if !writeFile.isNil(), it must
+ // also be writable. If hostFD is -1, no such host FD is available.
//
// These fields are protected by handleMu.
- handleMu sync.RWMutex
- handle handle
- handleReadable bool
- handleWritable bool
+ //
+ // readFile and writeFile may or may not represent the same p9.File. Once
+ // either p9.File transitions from closed (isNil() == true) to open
+ // (isNil() == false), it may be mutated with handleMu locked, but cannot
+ // be closed until the dentry is destroyed.
+ handleMu sync.RWMutex `state:"nosave"`
+ readFile p9file `state:"nosave"`
+ writeFile p9file `state:"nosave"`
+ hostFD int32 `state:"nosave"`
- dataMu sync.RWMutex
+ dataMu sync.RWMutex `state:"nosave"`
// If this dentry represents a regular file that is client-cached, cache
// maps offsets into the cached file to offsets into
@@ -646,7 +759,7 @@ type dentry struct {
// tracks dirty segments in cache. dirty is protected by dataMu.
dirty fsutil.DirtySet
- // pf implements platform.File for mappings of handle.fd.
+ // pf implements platform.File for mappings of hostFD.
pf dentryPlatformFile
// If this dentry represents a symbolic link, InteropModeShared is not in
@@ -662,6 +775,18 @@ type dentry struct {
// If this dentry represents a synthetic named pipe, pipe is the pipe
// endpoint bound to this file.
pipe *pipe.VFSPipe
+
+ locks vfs.FileLocks
+
+ // Inotify watches for this dentry.
+ //
+ // Note that inotify may behave unexpectedly in the presence of hard links,
+ // because dentries corresponding to the same file have separate inotify
+ // watches when they should share the same set. This is the case because it is
+ // impossible for us to know for sure whether two dentries correspond to the
+ // same underlying file (see the gofer filesystem section of vfs/inotify.md for
+ // a more in-depth discussion on this matter).
+ watches vfs.Watches
}
// dentryAttrMask returns a p9.AttrMask enabling all attributes used by the
@@ -697,15 +822,14 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
d := &dentry{
fs: fs,
+ qidPath: qid.Path,
file: file,
- ino: qid.Path,
+ ino: fs.inoFromQIDPath(qid.Path),
mode: uint32(attr.Mode),
uid: uint32(fs.opts.dfltuid),
gid: uint32(fs.opts.dfltgid),
blockSize: usermem.PageSize,
- handle: handle{
- fd: -1,
- },
+ hostFD: -1,
}
d.pf.dentry = d
if mask.UID {
@@ -736,6 +860,9 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
d.nlink = uint32(attr.NLink)
}
d.vfsd.Init(d)
+ if refsvfs2.LeakCheckEnabled() {
+ refsvfs2.Register(d, "gofer.dentry")
+ }
fs.syncMu.Lock()
fs.syncableDentries[d] = struct{}{}
@@ -743,6 +870,21 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
return d, nil
}
+func (fs *filesystem) inoFromQIDPath(qidPath uint64) uint64 {
+ fs.inoMu.Lock()
+ defer fs.inoMu.Unlock()
+ if ino, ok := fs.inoByQIDPath[qidPath]; ok {
+ return ino
+ }
+ ino := fs.nextIno()
+ fs.inoByQIDPath[qidPath] = ino
+ return ino
+}
+
+func (fs *filesystem) nextIno() uint64 {
+ return atomic.AddUint64(&fs.lastIno, 1)
+}
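Given this memoization, every dentry observed with the same QID.Path (e.g. hard links to one file) reports the same inode number, while a fresh path draws the next counter value. A toy illustration, assuming a *filesystem value fs:

    inoA := fs.inoFromQIDPath(0xdead) // first sighting: assigns a fresh ino
    inoB := fs.inoFromQIDPath(0xdead) // repeat sighting: identical to inoA
    inoC := fs.inoFromQIDPath(0xbeef) // different QID path: a new ino
    _ = inoA == inoB && inoA != inoC  // always true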
+
func (d *dentry) isSynthetic() bool {
return d.file.isNil()
}
@@ -753,8 +895,8 @@ func (d *dentry) cachedMetadataAuthoritative() bool {
// updateFromP9Attrs is called to update d's metadata after an update from the
// remote filesystem.
-func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) {
- d.metadataMu.Lock()
+// Precondition: d.metadataMu must be locked.
+func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) {
if mask.Mode {
if got, want := uint32(attr.Mode.FileType()), d.fileType(); got != want {
d.metadataMu.Unlock()
@@ -772,10 +914,12 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) {
if attr.BlockSize != 0 {
atomic.StoreUint32(&d.blockSize, uint32(attr.BlockSize))
}
- if mask.ATime {
+ // Don't override newer client-defined timestamps with old server-defined
+ // ones.
+ if mask.ATime && atomic.LoadUint32(&d.atimeDirty) == 0 {
atomic.StoreInt64(&d.atime, dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds))
}
- if mask.MTime {
+ if mask.MTime && atomic.LoadUint32(&d.mtimeDirty) == 0 {
atomic.StoreInt64(&d.mtime, dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds))
}
if mask.CTime {
@@ -788,25 +932,33 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) {
atomic.StoreUint32(&d.nlink, uint32(attr.NLink))
}
if mask.Size {
- d.dataMu.Lock()
- atomic.StoreUint64(&d.size, attr.Size)
- d.dataMu.Unlock()
+ d.updateSizeLocked(attr.Size)
}
- d.metadataMu.Unlock()
}
-// Preconditions: !d.isSynthetic()
+// Preconditions: !d.isSynthetic().
func (d *dentry) updateFromGetattr(ctx context.Context) error {
- // Use d.handle.file, which represents a 9P fid that has been opened, in
- // preference to d.file, which represents a 9P fid that has not. This may
- // be significantly more efficient in some implementations.
+ // Use d.readFile or d.writeFile, which represent 9P fids that have been
+ // opened, in preference to d.file, which represents a 9P fid that has not.
+ // This may be significantly more efficient in some implementations. Prefer
+ // d.writeFile over d.readFile since some filesystem implementations may
+ // update a writable handle's metadata after writes to that handle, without
+ // making metadata updates immediately visible to read-only handles
+ // representing the same file.
var (
file p9file
handleMuRLocked bool
)
+ // d.metadataMu must be locked *before* we getAttr so that we do not end up
+ // updating stale attributes in d.updateFromP9AttrsLocked().
+ d.metadataMu.Lock()
+ defer d.metadataMu.Unlock()
d.handleMu.RLock()
- if !d.handle.file.isNil() {
- file = d.handle.file
+ if !d.writeFile.isNil() {
+ file = d.writeFile
+ handleMuRLocked = true
+ } else if !d.readFile.isNil() {
+ file = d.readFile
handleMuRLocked = true
} else {
file = d.file
@@ -819,7 +971,7 @@ func (d *dentry) updateFromGetattr(ctx context.Context) error {
if err != nil {
return err
}
- d.updateFromP9Attrs(attrMask, &attr)
+ d.updateFromP9AttrsLocked(attrMask, &attr)
return nil
}
@@ -831,23 +983,32 @@ func (d *dentry) statTo(stat *linux.Statx) {
stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME
stat.Blksize = atomic.LoadUint32(&d.blockSize)
stat.Nlink = atomic.LoadUint32(&d.nlink)
+ if stat.Nlink == 0 {
+ // The remote filesystem doesn't support link count; just make
+ // something up. This is consistent with Linux, where
+ // fs/inode.c:inode_init_always() initializes link count to 1, and
+ // fs/9p/vfs_inode_dotl.c:v9fs_stat2inode_dotl() doesn't touch it if
+ // it's not provided by the remote filesystem.
+ stat.Nlink = 1
+ }
stat.UID = atomic.LoadUint32(&d.uid)
stat.GID = atomic.LoadUint32(&d.gid)
stat.Mode = uint16(atomic.LoadUint32(&d.mode))
- stat.Ino = d.ino
+ stat.Ino = uint64(d.ino)
stat.Size = atomic.LoadUint64(&d.size)
// This is consistent with regularFileFD.Seek(), which treats regular files
// as having no holes.
stat.Blocks = (stat.Size + 511) / 512
- stat.Atime = statxTimestampFromDentry(atomic.LoadInt64(&d.atime))
- stat.Btime = statxTimestampFromDentry(atomic.LoadInt64(&d.btime))
- stat.Ctime = statxTimestampFromDentry(atomic.LoadInt64(&d.ctime))
- stat.Mtime = statxTimestampFromDentry(atomic.LoadInt64(&d.mtime))
+ stat.Atime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.atime))
+ stat.Btime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.btime))
+ stat.Ctime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.ctime))
+ stat.Mtime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.mtime))
stat.DevMajor = linux.UNNAMED_MAJOR
stat.DevMinor = d.fs.devMinor
}
-func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error {
+func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions, mnt *vfs.Mount) error {
+ stat := &opts.Stat
if stat.Mask == 0 {
return nil
}
@@ -855,37 +1016,47 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
return syserror.EPERM
}
mode := linux.FileMode(atomic.LoadUint32(&d.mode))
- if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
+ if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
return err
}
if err := mnt.CheckBeginWrite(); err != nil {
return err
}
defer mnt.EndWrite()
- setLocalAtime := false
- setLocalMtime := false
+
+ if stat.Mask&linux.STATX_SIZE != 0 {
+ // Reject attempts to truncate files other than regular files, since
+ // filesystem implementations may return the wrong errno.
+ switch mode.FileType() {
+ case linux.S_IFREG:
+ // ok
+ case linux.S_IFDIR:
+ return syserror.EISDIR
+ default:
+ return syserror.EINVAL
+ }
+ }
+
+ var now int64
if d.cachedMetadataAuthoritative() {
- // Timestamp updates will be handled locally.
- setLocalAtime = stat.Mask&linux.STATX_ATIME != 0
- setLocalMtime = stat.Mask&linux.STATX_MTIME != 0
- stat.Mask &^= linux.STATX_ATIME | linux.STATX_MTIME
-
- // Prepare for truncate.
- if stat.Mask&linux.STATX_SIZE != 0 {
- switch d.mode & linux.S_IFMT {
- case linux.S_IFREG:
- if !setLocalMtime {
- // Truncate updates mtime.
- setLocalMtime = true
- stat.Mtime.Nsec = linux.UTIME_NOW
- }
- case linux.S_IFDIR:
- return syserror.EISDIR
- default:
- return syserror.EINVAL
+ // Truncate updates mtime.
+ if stat.Mask&(linux.STATX_SIZE|linux.STATX_MTIME) == linux.STATX_SIZE {
+ stat.Mask |= linux.STATX_MTIME
+ stat.Mtime = linux.StatxTimestamp{
+ Nsec: linux.UTIME_NOW,
}
}
+
+ // Use client clocks for timestamps.
+ now = d.fs.clock.Now().Nanoseconds()
+ if stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec == linux.UTIME_NOW {
+ stat.Atime = linux.NsecToStatxTimestamp(now)
+ }
+ if stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec == linux.UTIME_NOW {
+ stat.Mtime = linux.NsecToStatxTimestamp(now)
+ }
}
+
d.metadataMu.Lock()
defer d.metadataMu.Unlock()
if !d.isSynthetic() {
@@ -911,6 +1082,12 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
}); err != nil {
return err
}
+ if stat.Mask&linux.STATX_SIZE != 0 {
+ // d.size should be kept up to date, and privatized
+ // copy-on-write mappings of truncated pages need to be
+ // invalidated, even if InteropModeShared is in effect.
+ d.updateSizeLocked(stat.Size)
+ }
}
if d.fs.opts.interop == InteropModeShared {
// There's no point to updating d's metadata in this case since
@@ -920,7 +1097,6 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
return nil
}
}
- now := d.fs.clock.Now().Nanoseconds()
if stat.Mask&linux.STATX_MODE != 0 {
atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode))
}
@@ -930,61 +1106,104 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
if stat.Mask&linux.STATX_GID != 0 {
atomic.StoreUint32(&d.gid, stat.GID)
}
- if setLocalAtime {
- if stat.Atime.Nsec == linux.UTIME_NOW {
- atomic.StoreInt64(&d.atime, now)
- } else {
- atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime))
- }
+ // Note that stat.Atime.Nsec and stat.Mtime.Nsec can't be UTIME_NOW because
+ // if d.cachedMetadataAuthoritative() then we converted stat.Atime and
+ // stat.Mtime to client-local timestamps above, and if
+ // !d.cachedMetadataAuthoritative() then we returned after calling
+ // d.file.setAttr(). For the same reason, now must have been initialized.
+ if stat.Mask&linux.STATX_ATIME != 0 {
+ atomic.StoreInt64(&d.atime, stat.Atime.ToNsec())
+ atomic.StoreUint32(&d.atimeDirty, 0)
}
- if setLocalMtime {
- if stat.Mtime.Nsec == linux.UTIME_NOW {
- atomic.StoreInt64(&d.mtime, now)
- } else {
- atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime))
- }
+ if stat.Mask&linux.STATX_MTIME != 0 {
+ atomic.StoreInt64(&d.mtime, stat.Mtime.ToNsec())
+ atomic.StoreUint32(&d.mtimeDirty, 0)
}
atomic.StoreInt64(&d.ctime, now)
- if stat.Mask&linux.STATX_SIZE != 0 {
+ return nil
+}
+
+// doAllocate performs the given allocate operation on d and updates d.size on
+// success. Note that d.metadataMu is held while the allocate callback runs.
+func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate func() error) error {
+ d.metadataMu.Lock()
+ defer d.metadataMu.Unlock()
+
+ // Allocating a smaller size is a noop.
+ size := offset + length
+ if d.cachedMetadataAuthoritative() && size <= d.size {
+ return nil
+ }
+
+ err := allocate()
+ if err != nil {
+ return err
+ }
+ d.updateSizeLocked(size)
+ if d.cachedMetadataAuthoritative() {
+ d.touchCMtimeLocked()
+ }
+ return nil
+}
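A hedged caller sketch for doAllocate: the closure runs with d.metadataMu held, so it must not re-enter other metadataMu-protected paths on d (allocateRemote below is illustrative, not a function in this diff):

    err := d.doAllocate(ctx, offset, length, func() error {
        // Forward the allocation to the remote file while the size check and
        // the subsequent d.updateSizeLocked remain atomic with respect to
        // other metadata mutations.
        return allocateRemote(ctx, offset, length)
    })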
+
+// Preconditions: d.metadataMu must be locked.
+func (d *dentry) updateSizeLocked(newSize uint64) {
+ d.dataMu.Lock()
+ oldSize := d.size
+ atomic.StoreUint64(&d.size, newSize)
+ // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings
+ // below. This allows concurrent calls to Read/Translate/etc. These
+ // functions synchronize with truncation by refusing to use cache
+ // contents beyond the new d.size. (We are still holding d.metadataMu,
+ // so we can't race with Write or another truncate.)
+ d.dataMu.Unlock()
+ if d.size < oldSize {
+ oldpgend, _ := usermem.PageRoundUp(oldSize)
+ newpgend, _ := usermem.PageRoundUp(d.size)
+ if oldpgend != newpgend {
+ d.mapsMu.Lock()
+ d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
+ // Compare Linux's mm/truncate.c:truncate_setsize() =>
+ // truncate_pagecache() =>
+ // mm/memory.c:unmap_mapping_range(evencows=1).
+ InvalidatePrivate: true,
+ })
+ d.mapsMu.Unlock()
+ }
+ // We are now guaranteed that there are no translations of
+ // truncated pages, and can remove them from the cache. Since
+ // truncated pages have been removed from the remote file, they
+ // should be dropped without being written back.
d.dataMu.Lock()
- oldSize := d.size
- d.size = stat.Size
- // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings
- // below. This allows concurrent calls to Read/Translate/etc. These
- // functions synchronize with truncation by refusing to use cache
- // contents beyond the new d.size. (We are still holding d.metadataMu,
- // so we can't race with Write or another truncate.)
+ d.cache.Truncate(d.size, d.fs.mfp.MemoryFile())
+ d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend})
d.dataMu.Unlock()
- if d.size < oldSize {
- oldpgend, _ := usermem.PageRoundUp(oldSize)
- newpgend, _ := usermem.PageRoundUp(d.size)
- if oldpgend != newpgend {
- d.mapsMu.Lock()
- d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
- // Compare Linux's mm/truncate.c:truncate_setsize() =>
- // truncate_pagecache() =>
- // mm/memory.c:unmap_mapping_range(evencows=1).
- InvalidatePrivate: true,
- })
- d.mapsMu.Unlock()
- }
- // We are now guaranteed that there are no translations of
- // truncated pages, and can remove them from the cache. Since
- // truncated pages have been removed from the remote file, they
- // should be dropped without being written back.
- d.dataMu.Lock()
- d.cache.Truncate(d.size, d.fs.mfp.MemoryFile())
- d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend})
- d.dataMu.Unlock()
- }
}
- return nil
}
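The unlock-then-relock dance in updateSizeLocked follows from the lock order (mapsMu precedes dataMu): the shrunken size is published atomically first, so concurrent readers already refuse cache contents past it, and only then are stale mappings invalidated under the coarser lock. A generic sketch of the pattern, independent of the gofer types (assumes sync and sync/atomic):

    func shrink(data, maps *sync.Mutex, size *uint64, newSize uint64, invalidate func()) {
        // Caller holds the metadata lock (cf. d.metadataMu), serializing shrinks.
        data.Lock()
        atomic.StoreUint64(size, newSize) // publish before invalidating
        data.Unlock()
        maps.Lock() // taking maps while holding data would violate the lock order
        invalidate()
        maps.Unlock()
    }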
func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
}
+func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
+ // We only support xattrs prefixed with "user." (see b/148380782). Currently,
+ // there is no need to expose any other xattrs through a gofer.
+ if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+ return syserror.EOPNOTSUPP
+ }
+ mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+ kuid := auth.KUID(atomic.LoadUint32(&d.uid))
+ kgid := auth.KGID(atomic.LoadUint32(&d.gid))
+ if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
+ return err
+ }
+ return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
+}
+
+func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error {
+ return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&child.uid)))
+}
+
func dentryUIDFromP9UID(uid p9.UID) uint32 {
if !uid.Ok() {
return uint32(auth.OverflowUID)
@@ -1020,10 +1239,10 @@ func (d *dentry) TryIncRef() bool {
}
// DecRef implements vfs.DentryImpl.DecRef.
-func (d *dentry) DecRef() {
+func (d *dentry) DecRef(ctx context.Context) {
if refs := atomic.AddInt64(&d.refs, -1); refs == 0 {
d.fs.renameMu.Lock()
- d.checkCachingLocked()
+ d.checkCachingLocked(ctx)
d.fs.renameMu.Unlock()
} else if refs < 0 {
panic("gofer.dentry.DecRef() called without holding a reference")
@@ -1039,16 +1258,40 @@ func (d *dentry) decRefLocked() {
}
}
+// LeakMessage implements refsvfs2.CheckedObject.LeakMessage.
+func (d *dentry) LeakMessage() string {
+ return fmt.Sprintf("[gofer.dentry %p] reference count of %d instead of -1", d, atomic.LoadInt64(&d.refs))
+}
+
// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
-//
-// TODO(gvisor.dev/issue/1479): Implement inotify.
-func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {}
+func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
+ if d.isDir() {
+ events |= linux.IN_ISDIR
+ }
+
+ d.fs.renameMu.RLock()
+ // The ordering below is important, Linux always notifies the parent first.
+ if d.parent != nil {
+ d.parent.watches.Notify(ctx, d.name, events, cookie, et, d.isDeleted())
+ }
+ d.watches.Notify(ctx, "", events, cookie, et, d.isDeleted())
+ d.fs.renameMu.RUnlock()
+}
// Watches implements vfs.DentryImpl.Watches.
-//
-// TODO(gvisor.dev/issue/1479): Implement inotify.
func (d *dentry) Watches() *vfs.Watches {
- return nil
+ return &d.watches
+}
+
+// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
+//
+// If no watches are left on this dentry and it has no references, cache it.
+func (d *dentry) OnZeroWatches(ctx context.Context) {
+ if atomic.LoadInt64(&d.refs) == 0 {
+ d.fs.renameMu.Lock()
+ d.checkCachingLocked(ctx)
+ d.fs.renameMu.Unlock()
+ }
}
// checkCachingLocked should be called after d's reference count becomes 0 or it
@@ -1060,13 +1303,18 @@ func (d *dentry) Watches() *vfs.Watches {
// operation. One of the calls may destroy the dentry, so subsequent calls will
// do nothing.
//
-// Preconditions: d.fs.renameMu must be locked for writing.
-func (d *dentry) checkCachingLocked() {
+// Preconditions: d.fs.renameMu must be locked for writing; it may be
+// temporarily unlocked.
+func (d *dentry) checkCachingLocked(ctx context.Context) {
// Dentries with a non-zero reference count must be retained. (The only way
// to obtain a reference on a dentry with zero references is via path
// resolution, which requires renameMu, so if d.refs is zero then it will
// remain zero while we hold renameMu for writing.)
refs := atomic.LoadInt64(&d.refs)
+ if refs == -1 {
+ // Dentry has already been destroyed.
+ return
+ }
if refs > 0 {
if d.cached {
d.fs.cachedDentries.Remove(d)
@@ -1075,21 +1323,38 @@ func (d *dentry) checkCachingLocked() {
}
return
}
- if refs == -1 {
- // Dentry has already been destroyed.
- return
- }
// Deleted and invalidated dentries with zero references are no longer
// reachable by path resolution and should be dropped immediately.
if d.vfsd.IsDead() {
+ if d.isDeleted() {
+ d.watches.HandleDeletion(ctx)
+ }
if d.cached {
d.fs.cachedDentries.Remove(d)
d.fs.cachedDentriesLen--
d.cached = false
}
- d.destroyLocked()
+ d.destroyLocked(ctx)
return
}
+ // If d still has inotify watches and it is not deleted or invalidated, we
+ // cannot cache it and allow it to be evicted. Otherwise, we will lose its
+ // watches, even if a new dentry is created for the same file in the future.
+ // Note that the size of d.watches cannot concurrently transition from zero
+ // to non-zero, because adding a watch requires holding a reference on d.
+ if d.watches.Size() > 0 {
+ return
+ }
+
+ if atomic.LoadInt32(&d.fs.released) != 0 {
+ if d.parent != nil {
+ d.parent.dirMu.Lock()
+ delete(d.parent.children, d.name)
+ d.parent.dirMu.Unlock()
+ }
+ d.destroyLocked(ctx)
+ return
+ }
+
// If d is already cached, just move it to the front of the LRU.
if d.cached {
d.fs.cachedDentries.Remove(d)
@@ -1102,39 +1367,56 @@ func (d *dentry) checkCachingLocked() {
d.fs.cachedDentriesLen++
d.cached = true
if d.fs.cachedDentriesLen > d.fs.opts.maxCachedDentries {
- victim := d.fs.cachedDentries.Back()
- d.fs.cachedDentries.Remove(victim)
- d.fs.cachedDentriesLen--
- victim.cached = false
- // victim.refs may have become non-zero from an earlier path resolution
- // since it was inserted into fs.cachedDentries.
- if atomic.LoadInt64(&victim.refs) == 0 {
- if victim.parent != nil {
- victim.parent.dirMu.Lock()
- if !victim.vfsd.IsDead() {
- // Note that victim can't be a mount point (in any mount
- // namespace), since VFS holds references on mount points.
- d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(&victim.vfsd)
- delete(victim.parent.children, victim.name)
- // We're only deleting the dentry, not the file it
- // represents, so we don't need to update
- // victimParent.dirents etc.
- }
- victim.parent.dirMu.Unlock()
- }
- victim.destroyLocked()
- }
+ d.fs.evictCachedDentryLocked(ctx)
// Whether or not victim was destroyed, we brought fs.cachedDentriesLen
// back down to fs.opts.maxCachedDentries, so we don't loop.
}
}
-// destroyLocked destroys the dentry. It may flushes dirty pages from cache,
-// close p9 file and remove reference on parent dentry.
+// Precondition: fs.renameMu must be locked for writing; it may be temporarily
+// unlocked.
+func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) {
+ for fs.cachedDentriesLen != 0 {
+ fs.evictCachedDentryLocked(ctx)
+ }
+}
+
+// Preconditions:
+// * fs.renameMu must be locked for writing; it may be temporarily unlocked.
+// * fs.cachedDentriesLen != 0.
+func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) {
+ victim := fs.cachedDentries.Back()
+ fs.cachedDentries.Remove(victim)
+ fs.cachedDentriesLen--
+ victim.cached = false
+ // victim.refs may have become non-zero from an earlier path resolution
+ // since it was inserted into fs.cachedDentries.
+ if atomic.LoadInt64(&victim.refs) == 0 {
+ if victim.parent != nil {
+ victim.parent.dirMu.Lock()
+ if !victim.vfsd.IsDead() {
+ // Note that victim can't be a mount point (in any mount
+ // namespace), since VFS holds references on mount points.
+ fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &victim.vfsd)
+ delete(victim.parent.children, victim.name)
+ // We're only deleting the dentry, not the file it
+ // represents, so we don't need to update
+ // victimParent.dirents etc.
+ }
+ victim.parent.dirMu.Unlock()
+ }
+ victim.destroyLocked(ctx)
+ }
+}
+
+// destroyLocked destroys the dentry.
//
-// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. d is
-// not a child dentry.
-func (d *dentry) destroyLocked() {
+// Preconditions:
+// * d.fs.renameMu must be locked for writing; it may be temporarily unlocked.
+// * d.refs == 0.
+// * d.parent.children[d.name] != d, i.e. d is not reachable by path traversal
+// from its former parent dentry.
+func (d *dentry) destroyLocked(ctx context.Context) {
switch atomic.LoadInt64(&d.refs) {
case 0:
// Mark the dentry destroyed.
@@ -1145,43 +1427,76 @@ func (d *dentry) destroyLocked() {
panic("dentry.destroyLocked() called with references on the dentry")
}
- ctx := context.Background()
+ // Allow the following to proceed without renameMu locked to improve
+ // scalability.
+ d.fs.renameMu.Unlock()
+
+ mf := d.fs.mfp.MemoryFile()
d.handleMu.Lock()
- if !d.handle.file.isNil() {
- mf := d.fs.mfp.MemoryFile()
- d.dataMu.Lock()
+ d.dataMu.Lock()
+ if h := d.writeHandleLocked(); h.isOpen() {
// Write dirty pages back to the remote filesystem.
- if d.handleWritable {
- if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil {
- log.Warningf("gofer.dentry.DecRef: failed to write dirty data back: %v", err)
- }
+ if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
+ log.Warningf("gofer.dentry.destroyLocked: failed to write dirty data back: %v", err)
}
- // Discard cached data.
+ }
+ // Discard cached data.
+ if !d.cache.IsEmpty() {
+ mf.MarkAllUnevictable(d)
d.cache.DropAll(mf)
d.dirty.RemoveAll()
- d.dataMu.Unlock()
- // Clunk open fids and close open host FDs.
- d.handle.close(ctx)
+ }
+ d.dataMu.Unlock()
+ // Clunk open fids and close open host FDs.
+ if !d.readFile.isNil() {
+ d.readFile.close(ctx)
+ }
+ if !d.writeFile.isNil() && d.readFile != d.writeFile {
+ d.writeFile.close(ctx)
+ }
+ d.readFile = p9file{}
+ d.writeFile = p9file{}
+ if d.hostFD >= 0 {
+ syscall.Close(int(d.hostFD))
+ d.hostFD = -1
}
d.handleMu.Unlock()
if !d.file.isNil() {
- d.file.close(ctx)
+ // Note that it's possible that d.atimeDirty or d.mtimeDirty are true,
+ // i.e. client and server timestamps may differ (because e.g. a client
+ // write was serviced by the page cache, and only written back to the
+ // remote file later). Ideally, we'd write client timestamps back to
+ // the remote filesystem so that timestamps for a new dentry
+ // instantiated for the same file would remain coherent. Unfortunately,
+ // this turns out to be too expensive in many cases, so for now we
+ // don't do this.
+ if err := d.file.close(ctx); err != nil {
+ log.Warningf("gofer.dentry.destroyLocked: failed to close file: %v", err)
+ }
d.file = p9file{}
+
// Remove d from the set of syncable dentries.
d.fs.syncMu.Lock()
delete(d.fs.syncableDentries, d)
d.fs.syncMu.Unlock()
}
+
+ d.fs.renameMu.Lock()
+
// Drop the reference held by d on its parent without recursively locking
// d.fs.renameMu.
if d.parent != nil {
if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 {
- d.parent.checkCachingLocked()
+ d.parent.checkCachingLocked(ctx)
} else if refs < 0 {
panic("gofer.dentry.DecRef() called without holding a reference")
}
}
+
+ if refsvfs2.LeakCheckEnabled() {
+ refsvfs2.Unregister(d, "gofer.dentry")
+ }
}
func (d *dentry) isDeleted() bool {
@@ -1192,10 +1507,8 @@ func (d *dentry) setDeleted() {
atomic.StoreUint32(&d.deleted, 1)
}
-// We only support xattrs prefixed with "user." (see b/148380782). Currently,
-// there is no need to expose any other xattrs through a gofer.
-func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
- if d.file.isNil() {
+func (d *dentry) listXattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
+ if d.file.isNil() || !d.userXattrSupported() {
return nil, nil
}
xattrMap, err := d.file.listXattr(ctx, size)
@@ -1204,6 +1517,7 @@ func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size ui
}
xattrs := make([]string, 0, len(xattrMap))
for x := range xattrMap {
+ // We only support xattrs in the user.* namespace.
if strings.HasPrefix(x, linux.XATTR_USER_PREFIX) {
xattrs = append(xattrs, x)
}
@@ -1211,125 +1525,166 @@ func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size ui
return xattrs, nil
}
-func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
+func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
if d.file.isNil() {
return "", syserror.ENODATA
}
- if err := d.checkPermissions(creds, vfs.MayRead); err != nil {
+ if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
return "", err
}
- if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
- return "", syserror.EOPNOTSUPP
- }
return d.file.getXattr(ctx, opts.Name, opts.Size)
}
-func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
+func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
if d.file.isNil() {
return syserror.EPERM
}
- if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
+ if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
return err
}
- if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
- return syserror.EOPNOTSUPP
- }
return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags)
}
-func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name string) error {
+func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error {
if d.file.isNil() {
return syserror.EPERM
}
- if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
+ if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
return err
}
- if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
- return syserror.EOPNOTSUPP
- }
return d.file.removeXattr(ctx, name)
}
-// Preconditions: !d.isSynthetic(). d.isRegularFile() || d.isDirectory().
+// Extended attributes in the user.* namespace are only supported for regular
+// files and directories.
+func (d *dentry) userXattrSupported() bool {
+ filetype := linux.FileMode(atomic.LoadUint32(&d.mode)).FileType()
+ return filetype == linux.ModeRegular || filetype == linux.ModeDirectory
+}
+
+// Preconditions:
+// * !d.isSynthetic().
+// * d.isRegularFile() || d.isDir().
func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error {
// O_TRUNC unconditionally requires us to obtain a new handle (opened with
// O_TRUNC).
if !trunc {
d.handleMu.RLock()
- if (!read || d.handleReadable) && (!write || d.handleWritable) {
- // The current handle is sufficient.
+ if (!read || !d.readFile.isNil()) && (!write || !d.writeFile.isNil()) {
+ // Current handles are sufficient.
d.handleMu.RUnlock()
return nil
}
d.handleMu.RUnlock()
}
- haveOldFD := false
+ fdToClose := int32(-1)
+ invalidateTranslations := false
d.handleMu.Lock()
- if (read && !d.handleReadable) || (write && !d.handleWritable) || trunc {
- // Get a new handle.
- wantReadable := d.handleReadable || read
- wantWritable := d.handleWritable || write
- h, err := openHandle(ctx, d.file, wantReadable, wantWritable, trunc)
+ if (read && d.readFile.isNil()) || (write && d.writeFile.isNil()) || trunc {
+ // Get a new handle. If this file has been opened for both reading and
+ // writing, try to get a single handle that is usable for both:
+ //
+ // - Writable memory mappings of a host FD require that the host FD is
+ // opened for both reading and writing.
+ //
+ // - NOTE(b/141991141): Some filesystems may not ensure coherence
+ // between multiple handles for the same file.
+ openReadable := !d.readFile.isNil() || read
+ openWritable := !d.writeFile.isNil() || write
+ h, err := openHandle(ctx, d.file, openReadable, openWritable, trunc)
+ if err == syserror.EACCES && (openReadable != read || openWritable != write) {
+ // It may not be possible to use a single handle for both
+ // reading and writing, since permissions on the file may have
+ // changed to e.g. disallow reading after previously being
+ // opened for reading. In this case, we have no choice but to
+ // use separate handles for reading and writing.
+ ctx.Debugf("gofer.dentry.ensureSharedHandle: bifurcating read/write handles for dentry %p", d)
+ openReadable = read
+ openWritable = write
+ h, err = openHandle(ctx, d.file, openReadable, openWritable, trunc)
+ }
if err != nil {
d.handleMu.Unlock()
return err
}
- if !d.handle.file.isNil() {
- // Check that old and new handles are compatible: If the old handle
- // includes a host file descriptor but the new one does not, or
- // vice versa, old and new memory mappings may be incoherent.
- haveOldFD = d.handle.fd >= 0
- haveNewFD := h.fd >= 0
- if haveOldFD != haveNewFD {
- d.handleMu.Unlock()
- ctx.Warningf("gofer.dentry.ensureSharedHandle: can't change host FD availability from %v to %v across dentry handle upgrade", haveOldFD, haveNewFD)
- h.close(ctx)
- return syserror.EIO
- }
- if haveOldFD {
- // We may have raced with callers of d.pf.FD() that are now
- // using the old file descriptor, preventing us from safely
- // closing it. We could handle this by invalidating existing
- // memmap.Translations, but this is expensive. Instead, use
- // dup3 to make the old file descriptor refer to the new file
- // description, then close the new file descriptor (which is no
- // longer needed). Racing callers may use the old or new file
- // description, but this doesn't matter since they refer to the
- // same file (unless d.fs.opts.overlayfsStaleRead is true,
- // which we handle separately).
- if err := syscall.Dup3(int(h.fd), int(d.handle.fd), syscall.O_CLOEXEC); err != nil {
+
+ if d.hostFD < 0 && h.fd >= 0 && openReadable && (d.writeFile.isNil() || openWritable) {
+ // We have no existing FD, and the new FD meets the requirements
+ // for d.hostFD, so start using it.
+ d.hostFD = h.fd
+ } else if d.hostFD >= 0 && d.writeFile.isNil() && openWritable {
+ // We have an existing read-only FD, but the file has just been
+ // opened for writing, so we need to start supporting writable memory
+ // mappings. This may race with callers of d.pf.FD() using the existing
+ // FD, so in most cases we need to delay closing the old FD until after
+ // invalidating memmap.Translations that might have observed it.
+ if !openReadable || h.fd < 0 {
+ // We don't have a read/write FD, so we have no FD that can be
+ // used to create writable memory mappings. Switch to using the
+ // internal page cache.
+ invalidateTranslations = true
+ fdToClose = d.hostFD
+ d.hostFD = -1
+ } else if d.fs.opts.overlayfsStaleRead {
+ // We do have a read/write FD, but it may not be coherent with
+ // the existing read-only FD, so we must switch to mappings of
+ // the new FD in both the application and sentry.
+ if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil {
d.handleMu.Unlock()
- ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err)
+ ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err)
h.close(ctx)
return err
}
- syscall.Close(int(h.fd))
- h.fd = d.handle.fd
- if d.fs.opts.overlayfsStaleRead {
- // Replace sentry mappings of the old FD with mappings of
- // the new FD, since the two are not necessarily coherent.
- if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil {
- d.handleMu.Unlock()
- ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err)
- h.close(ctx)
- return err
- }
+ invalidateTranslations = true
+ fdToClose = d.hostFD
+ d.hostFD = h.fd
+ } else {
+ // We do have a read/write FD. To avoid invalidating existing
+ // memmap.Translations (which is expensive), use dup3 to make
+ // the old file descriptor refer to the new file description,
+ // then close the new file descriptor (which is no longer
+ // needed). Racing callers of d.pf.FD() may use the old or new
+ // file description, but this doesn't matter since they refer
+ // to the same file, and any racing mappings must be read-only.
+ if err := syscall.Dup3(int(h.fd), int(d.hostFD), syscall.O_CLOEXEC); err != nil {
+ oldHostFD := d.hostFD
+ d.handleMu.Unlock()
+ ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, oldHostFD, err)
+ h.close(ctx)
+ return err
}
- // Clunk the old fid before making the new handle visible (by
- // unlocking d.handleMu).
- d.handle.file.close(ctx)
+ fdToClose = h.fd
}
+ } else {
+ // h.fd is not useful.
+ fdToClose = h.fd
+ }
+
+ // Switch to new fids.
+ var oldReadFile p9file
+ if openReadable {
+ oldReadFile = d.readFile
+ d.readFile = h.file
+ }
+ var oldWriteFile p9file
+ if openWritable {
+ oldWriteFile = d.writeFile
+ d.writeFile = h.file
+ }
+ // NOTE(b/141991141): Clunk old fids before making new fids visible (by
+ // unlocking d.handleMu).
+ if !oldReadFile.isNil() {
+ oldReadFile.close(ctx)
+ }
+ if !oldWriteFile.isNil() && oldReadFile != oldWriteFile {
+ oldWriteFile.close(ctx)
}
- // Switch to the new handle.
- d.handle = h
- d.handleReadable = wantReadable
- d.handleWritable = wantWritable
}
d.handleMu.Unlock()
- if d.fs.opts.overlayfsStaleRead && haveOldFD {
- // Invalidate application mappings that may be using the old FD; they
+ if invalidateTranslations {
+ // Invalidate application mappings that may be using an old FD; they
// will be replaced with mappings using the new FD after future calls
// to d.Translate(). This requires holding d.mapsMu, which precedes
// d.handleMu in the lock order.
@@ -1337,35 +1692,109 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
d.mappings.InvalidateAll(memmap.InvalidateOpts{})
d.mapsMu.Unlock()
}
+ if fdToClose >= 0 {
+ syscall.Close(int(fdToClose))
+ }
+
+ return nil
+}
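
The dup3 trick above relies on dup3(2) atomically repointing an existing
descriptor number at a different open file description, so racing users of the
old number transparently switch over. A minimal standalone sketch of that
behavior (illustrative only, not gVisor code):

package main

import (
	"fmt"
	"os"
	"runtime"
	"syscall"
)

func main() {
	oldF, _ := os.CreateTemp("", "old")
	newF, _ := os.CreateTemp("", "new")
	oldFD := int(oldF.Fd())
	newFD := int(newF.Fd())

	// Make oldFD refer to newF's file description. Anyone still holding the
	// number oldFD now operates on the new file.
	if err := syscall.Dup3(newFD, oldFD, syscall.O_CLOEXEC); err != nil {
		panic(err)
	}
	syscall.Close(newFD) // the new number is redundant; oldFD lives on

	syscall.Write(oldFD, []byte("hello"))
	data, _ := os.ReadFile(newF.Name())
	fmt.Printf("%q\n", data) // "hello": the write landed in the new file

	runtime.KeepAlive(oldF) // keep finalizers from closing FDs early
	runtime.KeepAlive(newF)
	os.Remove(oldF.Name())
	os.Remove(newF.Name())
}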
+
+// Preconditions: d.handleMu must be locked.
+func (d *dentry) readHandleLocked() handle {
+ return handle{
+ file: d.readFile,
+ fd: d.hostFD,
+ }
+}
+
+// Preconditions: d.handleMu must be locked.
+func (d *dentry) writeHandleLocked() handle {
+ return handle{
+ file: d.writeFile,
+ fd: d.hostFD,
+ }
+}
+
+func (d *dentry) syncRemoteFile(ctx context.Context) error {
+ d.handleMu.RLock()
+ defer d.handleMu.RUnlock()
+ return d.syncRemoteFileLocked(ctx)
+}
+
+// Preconditions: d.handleMu must be locked.
+func (d *dentry) syncRemoteFileLocked(ctx context.Context) error {
+ // If we have a host FD, fsyncing it is likely to be faster than an fsync
+ // RPC.
+ if d.hostFD >= 0 {
+ ctx.UninterruptibleSleepStart(false)
+ err := syscall.Fsync(int(d.hostFD))
+ ctx.UninterruptibleSleepFinish(false)
+ return err
+ }
+ if !d.writeFile.isNil() {
+ return d.writeFile.fsync(ctx)
+ }
+ if !d.readFile.isNil() {
+ return d.readFile.fsync(ctx)
+ }
+ return nil
+}
+
+// syncCachedFile writes back any dirty cached pages and then syncs the
+// remote file. If forFilesystemSync is true, sync failures are only returned
+// when sync could reasonably have been expected to succeed.
+func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) error {
+ d.handleMu.RLock()
+ defer d.handleMu.RUnlock()
+ h := d.writeHandleLocked()
+ if h.isOpen() {
+ // Write back dirty pages to the remote file.
+ d.dataMu.Lock()
+ err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt)
+ d.dataMu.Unlock()
+ if err != nil {
+ return err
+ }
+ }
+ if err := d.syncRemoteFileLocked(ctx); err != nil {
+ if !forFilesystemSync {
+ return err
+ }
+ // Only return err if we can reasonably have expected sync to succeed
+ // (d is a regular file and was opened for writing).
+ if d.isRegularFile() && h.isOpen() {
+ return err
+ }
+ ctx.Debugf("gofer.dentry.syncCachedFile: syncing a non-writable or non-regular-file dentry failed: %v", err)
+ }
return nil
}
// incLinks increments link count.
-//
-// Preconditions: d.nlink != 0 && d.nlink < math.MaxUint32.
func (d *dentry) incLinks() {
- v := atomic.AddUint32(&d.nlink, 1)
- if v < 2 {
- panic(fmt.Sprintf("dentry.nlink is invalid (was 0 or overflowed): %d", v))
+ if atomic.LoadUint32(&d.nlink) == 0 {
+ // The remote filesystem doesn't support link count.
+ return
}
+ atomic.AddUint32(&d.nlink, 1)
}
// decLinks decrements link count.
-//
-// Preconditions: d.nlink > 1.
func (d *dentry) decLinks() {
- v := atomic.AddUint32(&d.nlink, ^uint32(0))
- if v == 0 {
- panic(fmt.Sprintf("dentry.nlink must be greater than 0: %d", v))
+ if atomic.LoadUint32(&d.nlink) == 0 {
+ // The remote filesystem doesn't support link count.
+ return
}
+ atomic.AddUint32(&d.nlink, ^uint32(0))
}
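
The decrement above uses a Go idiom rather than a dedicated API: sync/atomic
has no SubUint32, so adding the all-ones value ^uint32(0) subtracts one by
modular (wrapping) arithmetic, as documented in the sync/atomic package. A
standalone illustration:

package main

import (
	"fmt"
	"sync/atomic"
)

func main() {
	var n uint32 = 3
	atomic.AddUint32(&n, ^uint32(0)) // all ones: subtracts 1, n == 2
	atomic.AddUint32(&n, ^uint32(1)) // ^uint32(c-1) subtracts c; here c == 2
	fmt.Println(n)                   // 0
}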
// fileDescription is embedded by gofer implementations of
// vfs.FileDescriptionImpl.
+//
+// +stateify savable
type fileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
+ vfs.LockFD
+
+ lockLogging sync.Once `state:"nosave"`
}
func (fd *fileDescription) filesystem() *filesystem {
@@ -1394,25 +1823,62 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu
// SetStat implements vfs.FileDescriptionImpl.SetStat.
func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
- return fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts.Stat, fd.vfsfd.Mount())
+ if err := fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts, fd.vfsfd.Mount()); err != nil {
+ return err
+ }
+ if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+ fd.dentry().InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
+ }
+ return nil
+}
+
+// ListXattr implements vfs.FileDescriptionImpl.ListXattr.
+func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
+ return fd.dentry().listXattr(ctx, auth.CredentialsFromContext(ctx), size)
+}
+
+// GetXattr implements vfs.FileDescriptionImpl.GetXattr.
+func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
+ return fd.dentry().getXattr(ctx, auth.CredentialsFromContext(ctx), &opts)
}
-// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
-func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
- return fd.dentry().listxattr(ctx, auth.CredentialsFromContext(ctx), size)
+// SetXattr implements vfs.FileDescriptionImpl.SetXattr.
+func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
+ d := fd.dentry()
+ if err := d.setXattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil {
+ return err
+ }
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ return nil
+}
+
+// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
+func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
+ d := fd.dentry()
+ if err := d.removeXattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil {
+ return err
+ }
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ return nil
}
-// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
-func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
- return fd.dentry().getxattr(ctx, auth.CredentialsFromContext(ctx), &opts)
+// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
+func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
+ fd.lockLogging.Do(func() {
+ log.Infof("File lock using gofer file handled internally.")
+ })
+ return fd.LockFD.LockBSD(ctx, uid, t, block)
}
-// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
-func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
- return fd.dentry().setxattr(ctx, auth.CredentialsFromContext(ctx), &opts)
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ fd.lockLogging.Do(func() {
+ log.Infof("Range lock using gofer file handled internally.")
+ })
+ return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
}
-// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
-func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
- return fd.dentry().removexattr(ctx, auth.CredentialsFromContext(ctx), name)
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
}
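
lockLogging throttles the informational message to at most one emission per
file description via sync.Once. A minimal sketch of the same once-per-object
logging pattern (hypothetical names, not gVisor code):

package main

import (
	"fmt"
	"sync"
)

type lockableFD struct {
	logOnce sync.Once
}

func (f *lockableFD) lock() {
	f.logOnce.Do(func() {
		fmt.Println("file lock handled internally") // printed at most once per lockableFD
	})
	// ... acquire the lock ...
}

func main() {
	f := &lockableFD{}
	f.lock()
	f.lock() // no second log line
}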
diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go
index adff39490..76f08e252 100644
--- a/pkg/sentry/fsimpl/gofer/gofer_test.go
+++ b/pkg/sentry/fsimpl/gofer/gofer_test.go
@@ -20,18 +20,21 @@ import (
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/sentry/contexttest"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
)
func TestDestroyIdempotent(t *testing.T) {
+ ctx := contexttest.Context(t)
fs := filesystem{
- syncableDentries: make(map[*dentry]struct{}),
+ mfp: pgalloc.MemoryFileProviderFromContext(ctx),
opts: filesystemOptions{
// Test relies on no dentry being held in the cache.
maxCachedDentries: 0,
},
+ syncableDentries: make(map[*dentry]struct{}),
+ inoByQIDPath: make(map[uint64]uint64),
}
- ctx := contexttest.Context(t)
attr := &p9.Attr{
Mode: p9.ModeRegular,
}
@@ -50,7 +53,9 @@ func TestDestroyIdempotent(t *testing.T) {
}
parent.cacheNewChildLocked(child, "child")
- child.checkCachingLocked()
+ fs.renameMu.Lock()
+ defer fs.renameMu.Unlock()
+ child.checkCachingLocked(ctx)
if got := atomic.LoadInt64(&child.refs); got != -1 {
t.Fatalf("child.refs=%d, want: -1", got)
}
@@ -58,6 +63,6 @@ func TestDestroyIdempotent(t *testing.T) {
if got := atomic.LoadInt64(&parent.refs); got != -1 {
t.Fatalf("parent.refs=%d, want: -1", got)
}
- child.checkCachingLocked()
- child.checkCachingLocked()
+ child.checkCachingLocked(ctx)
+ child.checkCachingLocked(ctx)
}
diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go
index 724a3f1f7..a9ebe1206 100644
--- a/pkg/sentry/fsimpl/gofer/handle.go
+++ b/pkg/sentry/fsimpl/gofer/handle.go
@@ -25,6 +25,8 @@ import (
// handle represents a remote "open file descriptor", consisting of an opened
// fid (p9.File) and optionally a host file descriptor.
+//
+// These are explicitly not savable.
type handle struct {
file p9file
fd int32 // -1 if unavailable
@@ -63,6 +65,10 @@ func openHandle(ctx context.Context, file p9file, read, write, trunc bool) (hand
}, nil
}
+func (h *handle) isOpen() bool {
+ return !h.file.isNil()
+}
+
func (h *handle) close(ctx context.Context) {
h.file.close(ctx)
h.file = p9file{}
@@ -124,13 +130,3 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o
}
return cp, cperr
}
-
-func (h *handle) sync(ctx context.Context) error {
- if h.fd >= 0 {
- ctx.UninterruptibleSleepStart(false)
- err := syscall.Fsync(int(h.fd))
- ctx.UninterruptibleSleepFinish(false)
- return err
- }
- return h.file.fsync(ctx)
-}
diff --git a/pkg/sentry/fsimpl/gofer/host_named_pipe.go b/pkg/sentry/fsimpl/gofer/host_named_pipe.go
index 7294de7d6..c7bf10007 100644
--- a/pkg/sentry/fsimpl/gofer/host_named_pipe.go
+++ b/pkg/sentry/fsimpl/gofer/host_named_pipe.go
@@ -51,8 +51,24 @@ func blockUntilNonblockingPipeHasWriter(ctx context.Context, fd int32) error {
if ok {
return nil
}
- if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil {
- return err
+ if sleepErr := sleepBetweenNamedPipeOpenChecks(ctx); sleepErr != nil {
+ // Another application thread may have opened this pipe for
+ // writing, succeeded because we previously opened the pipe for
+ // reading, and subsequently interrupted us for checkpointing (e.g.
+ // this occurs in mknod tests under cooperative save/restore). In
+ // this case, our open has to succeed for the checkpoint to include
+ // a readable FD for the pipe, which is in turn necessary to
+ // restore the other thread's writable FD for the same pipe
+ // (otherwise it will get ENXIO). So we have to check
+ // nonblockingPipeHasWriter() once last time.
+ ok, err := nonblockingPipeHasWriter(fd)
+ if err != nil {
+ return err
+ }
+ if ok {
+ return nil
+ }
+ return sleepErr
}
}
}
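
The change above is an instance of a general polling pattern: when the sleep
between checks is interrupted, re-run the check one final time before
propagating the interruption, so a condition that became true during the sleep
is not reported as a failure. A standalone sketch with hypothetical names:

package main

import (
	"errors"
	"fmt"
)

func pollUntil(check func() (bool, error), sleep func() error) error {
	for {
		ok, err := check()
		if err != nil {
			return err
		}
		if ok {
			return nil
		}
		if sleepErr := sleep(); sleepErr != nil {
			// Final recheck: the condition may have become true while we
			// were being interrupted (e.g. for checkpointing).
			if ok, err := check(); err != nil {
				return err
			} else if ok {
				return nil
			}
			return sleepErr
		}
	}
}

func main() {
	attempts := 0
	err := pollUntil(
		func() (bool, error) { attempts++; return attempts >= 3, nil },
		func() error {
			if attempts == 2 {
				return errors.New("interrupted")
			}
			return nil
		},
	)
	fmt.Println(err) // <nil>: the final recheck observed success
}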
diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go
index 87f0b877f..21b4a96fe 100644
--- a/pkg/sentry/fsimpl/gofer/p9file.go
+++ b/pkg/sentry/fsimpl/gofer/p9file.go
@@ -127,6 +127,13 @@ func (f p9file) close(ctx context.Context) error {
return err
}
+func (f p9file) setAttrClose(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAttr) error {
+ ctx.UninterruptibleSleepStart(false)
+ err := f.file.SetAttrClose(valid, attr)
+ ctx.UninterruptibleSleepFinish(false)
+ return err
+}
+
func (f p9file) open(ctx context.Context, flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
ctx.UninterruptibleSleepStart(false)
fdobj, qid, iounit, err := f.file.Open(flags)
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index 0d10cf7ac..dc8a890cb 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -18,19 +18,19 @@ import (
"fmt"
"io"
"math"
- "sync"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -39,16 +39,17 @@ func (d *dentry) isRegularFile() bool {
return d.fileType() == linux.S_IFREG
}
+// +stateify savable
type regularFileFD struct {
fileDescription
// off is the file offset. off is protected by mu.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
off int64
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *regularFileFD) Release() {
+func (fd *regularFileFD) Release(context.Context) {
}
// OnClose implements vfs.FileDescriptionImpl.OnClose.
@@ -56,15 +57,34 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error {
if !fd.vfsfd.IsWritable() {
return nil
}
- // Skip flushing if writes may be buffered by the client, since (as with
- // the VFS1 client) we don't flush buffered writes on close anyway.
+ // Skip flushing if there are client-buffered writes, since (as with the
+ // VFS1 client) we don't flush buffered writes on close anyway.
d := fd.dentry()
- if d.fs.opts.interop == InteropModeExclusive {
+ if d.fs.opts.interop != InteropModeExclusive {
+ return nil
+ }
+ d.dataMu.RLock()
+ haveDirtyPages := !d.dirty.IsEmpty()
+ d.dataMu.RUnlock()
+ if haveDirtyPages {
return nil
}
d.handleMu.RLock()
defer d.handleMu.RUnlock()
- return d.handle.file.flush(ctx)
+ if d.writeFile.isNil() {
+ return nil
+ }
+ return d.writeFile.flush(ctx)
+}
+
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ d := fd.dentry()
+ return d.doAllocate(ctx, offset, length, func() error {
+ d.handleMu.RLock()
+ defer d.handleMu.RUnlock()
+ return d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
+ })
}
// PRead implements vfs.FileDescriptionImpl.PRead.
@@ -72,17 +92,25 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
if offset < 0 {
return 0, syserror.EINVAL
}
- if opts.Flags != 0 {
+
+ // Check that flags are supported.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+ if opts.Flags&^linux.RWF_HIPRI != 0 {
return 0, syserror.EOPNOTSUPP
}
// Check for reading at EOF before calling into MM (but not under
// InteropModeShared, which makes d.size unreliable).
d := fd.dentry()
- if d.fs.opts.interop != InteropModeShared && uint64(offset) >= atomic.LoadUint64(&d.size) {
+ if d.cachedMetadataAuthoritative() && uint64(offset) >= atomic.LoadUint64(&d.size) {
return 0, io.EOF
}
+ var (
+ n int64
+ readErr error
+ )
if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
// Lock d.metadataMu for the rest of the read to prevent d.size from
// changing.
@@ -93,20 +121,25 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
if err := d.writeback(ctx, offset, dst.NumBytes()); err != nil {
return 0, err
}
- }
-
- rw := getDentryReadWriter(ctx, d, offset)
- if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+ rw := getDentryReadWriter(ctx, d, offset)
// Require the read to go to the remote file.
rw.direct = true
+ n, readErr = dst.CopyOutFrom(ctx, rw)
+ putDentryReadWriter(rw)
+ if d.fs.opts.interop != InteropModeShared {
+ // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
+ d.touchAtimeLocked(fd.vfsfd.Mount())
+ }
+ } else {
+ rw := getDentryReadWriter(ctx, d, offset)
+ n, readErr = dst.CopyOutFrom(ctx, rw)
+ putDentryReadWriter(rw)
+ if d.fs.opts.interop != InteropModeShared {
+ // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
+ d.touchAtime(fd.vfsfd.Mount())
+ }
}
- n, err := dst.CopyOutFrom(ctx, rw)
- putDentryReadWriter(rw)
- if d.fs.opts.interop != InteropModeShared {
- // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
- d.touchAtime(fd.vfsfd.Mount())
- }
- return n, err
+ return n, readErr
}
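
The flag check uses Go's AND NOT operator (&^): clearing the one supported bit
and testing for zero rejects any request carrying other flags. An illustrative
sketch (RWF_HIPRI is 0x1 in the Linux uapi):

package main

import "fmt"

const rwfHipri = 0x1

func flagsSupported(flags uint32) bool {
	return flags&^rwfHipri == 0
}

func main() {
	fmt.Println(flagsSupported(0))            // true: no flags set
	fmt.Println(flagsSupported(rwfHipri))     // true: only RWF_HIPRI
	fmt.Println(flagsSupported(rwfHipri | 4)) // false: unsupported flag present
}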
// Read implements vfs.FileDescriptionImpl.Read.
@@ -120,85 +153,134 @@ func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts
// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ n, _, err := fd.pwrite(ctx, src, offset, opts)
+ return n, err
+}
+
+// pwrite returns the number of bytes written, the final offset, and an
+// error. The final offset should be ignored by PWrite.
+func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
if offset < 0 {
- return 0, syserror.EINVAL
+ return 0, offset, syserror.EINVAL
}
- if opts.Flags != 0 {
- return 0, syserror.EOPNOTSUPP
+
+ // Check that flags are supported.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
+ if opts.Flags&^linux.RWF_HIPRI != 0 {
+ return 0, offset, syserror.EOPNOTSUPP
+ }
+
+ d := fd.dentry()
+ // If the fd was opened with O_APPEND, make sure the file size is updated.
+ // There is a possible race here if size is modified externally after
+ // metadata cache is updated.
+ if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
+ if err := d.updateFromGetattr(ctx); err != nil {
+ return 0, offset, err
+ }
+ }
+
+ d.metadataMu.Lock()
+ defer d.metadataMu.Unlock()
+
+ // Set offset to file size if the fd was opened with O_APPEND.
+ if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+ // Holding d.metadataMu is sufficient for reading d.size.
+ offset = int64(d.size)
}
limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
if err != nil {
- return 0, err
+ return 0, offset, err
}
src = src.TakeFirst64(limit)
- d := fd.dentry()
- d.metadataMu.Lock()
- defer d.metadataMu.Unlock()
if d.fs.opts.interop != InteropModeShared {
// Compare Linux's mm/filemap.c:__generic_file_write_iter() =>
// file_update_time(). This is d.touchCMtime(), but without locking
// d.metadataMu (recursively).
d.touchCMtimeLocked()
}
- if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
- // Write dirty cached pages that will be touched by the write back to
- // the remote file.
- if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
- return 0, err
- }
- // Remove touched pages from the cache.
- pgstart := usermem.PageRoundDown(uint64(offset))
- pgend, ok := usermem.PageRoundUp(uint64(offset + src.NumBytes()))
- if !ok {
- return 0, syserror.EINVAL
- }
- mr := memmap.MappableRange{pgstart, pgend}
- var freed []platform.FileRange
- d.dataMu.Lock()
- cseg := d.cache.LowerBoundSegment(mr.Start)
- for cseg.Ok() && cseg.Start() < mr.End {
- cseg = d.cache.Isolate(cseg, mr)
- freed = append(freed, platform.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()})
- cseg = d.cache.Remove(cseg).NextSegment()
- }
- d.dataMu.Unlock()
- // Invalidate mappings of removed pages.
- d.mapsMu.Lock()
- d.mappings.Invalidate(mr, memmap.InvalidateOpts{})
- d.mapsMu.Unlock()
- // Finally free pages removed from the cache.
- mf := d.fs.mfp.MemoryFile()
- for _, freedFR := range freed {
- mf.DecRef(freedFR)
- }
- }
+
rw := getDentryReadWriter(ctx, d, offset)
+ defer putDentryReadWriter(rw)
+
if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+ if err := fd.writeCache(ctx, d, offset, src); err != nil {
+ return 0, offset, err
+ }
+
// Require the write to go to the remote file.
rw.direct = true
}
+
n, err := src.CopyInTo(ctx, rw)
- putDentryReadWriter(rw)
- if n != 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 {
+ if err != nil {
+ return n, offset + n, err
+ }
+ if n > 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 {
+ // Note that if any of the following fail, then we can't guarantee that
+ // any data was actually written with the semantics of O_DSYNC or
+ // O_SYNC, so we return zero bytes written. Compare Linux's
+ // mm/filemap.c:generic_file_write_iter() =>
+ // include/linux/fs.h:generic_write_sync().
+ //
// Write dirty cached pages touched by the write back to the remote
// file.
if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
- return 0, err
+ return 0, offset, err
}
// Request the remote filesystem to sync the remote file.
- if err := d.handle.file.fsync(ctx); err != nil {
- return 0, err
+ if err := d.syncRemoteFile(ctx); err != nil {
+ return 0, offset, err
}
}
- return n, err
+ return n, offset + n, nil
+}
+
+// writeCache writes back dirty cached pages overlapping an O_DIRECT write,
+// then drops them from the cache and invalidates any mappings of them, so
+// stale cached data cannot mask the direct write.
+func (fd *regularFileFD) writeCache(ctx context.Context, d *dentry, offset int64, src usermem.IOSequence) error {
+ // Write dirty cached pages that will be touched by the write back to
+ // the remote file.
+ if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
+ return err
+ }
+
+ // Remove touched pages from the cache.
+ pgstart := usermem.PageRoundDown(uint64(offset))
+ pgend, ok := usermem.PageRoundUp(uint64(offset + src.NumBytes()))
+ if !ok {
+ return syserror.EINVAL
+ }
+ mr := memmap.MappableRange{pgstart, pgend}
+ var freed []memmap.FileRange
+
+ d.dataMu.Lock()
+ cseg := d.cache.LowerBoundSegment(mr.Start)
+ for cseg.Ok() && cseg.Start() < mr.End {
+ cseg = d.cache.Isolate(cseg, mr)
+ freed = append(freed, memmap.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()})
+ cseg = d.cache.Remove(cseg).NextSegment()
+ }
+ d.dataMu.Unlock()
+
+ // Invalidate mappings of removed pages.
+ d.mapsMu.Lock()
+ d.mappings.Invalidate(mr, memmap.InvalidateOpts{})
+ d.mapsMu.Unlock()
+
+ // Finally free pages removed from the cache.
+ mf := d.fs.mfp.MemoryFile()
+ for _, freedFR := range freed {
+ mf.DecRef(freedFR)
+ }
+ return nil
}
// Write implements vfs.FileDescriptionImpl.Write.
func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
fd.mu.Lock()
- n, err := fd.PWrite(ctx, src, fd.off, opts)
- fd.off += n
+ n, off, err := fd.pwrite(ctx, src, fd.off, opts)
+ fd.off = off
fd.mu.Unlock()
return n, err
}
@@ -241,10 +323,11 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error)
// coherence with memory-mapped I/O), or if InteropModeShared is in effect
// (which prevents us from caching file contents and makes dentry.size
// unreliable), or if the file was opened O_DIRECT, read directly from
- // dentry.handle without locking dentry.dataMu.
+ // dentry.readHandleLocked() without locking dentry.dataMu.
rw.d.handleMu.RLock()
- if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
- n, err := rw.d.handle.readToBlocksAt(rw.ctx, dsts, rw.off)
+ h := rw.d.readHandleLocked()
+ if (rw.d.hostFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
+ n, err := h.readToBlocksAt(rw.ctx, dsts, rw.off)
rw.d.handleMu.RUnlock()
rw.off += n
return n, err
@@ -312,7 +395,7 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error)
End: gapEnd,
}
optMR := gap.Range()
- err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, rw.d.handle.readToBlocksAt)
+ err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), rw.d.size, mf, usage.PageCache, h.readToBlocksAt)
mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End})
seg, gap = rw.d.cache.Find(rw.off)
if !seg.Ok() {
@@ -320,14 +403,14 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error)
rw.d.handleMu.RUnlock()
return done, err
}
- // err might have occurred in part of gap.Range() outside
- // gapMR. Forget about it for now; if the error matters and
- // persists, we'll run into it again in a later iteration of
- // this loop.
+ // err might have occurred in part of gap.Range() outside gapMR
+ // (in particular, gap.End() might be beyond EOF). Forget about
+ // it for now; if the error matters and persists, we'll run
+ // into it again in a later iteration of this loop.
} else {
// Read directly from the file.
gapDsts := dsts.TakeFirst64(gapMR.Length())
- n, err := rw.d.handle.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start)
+ n, err := h.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start)
done += n
rw.off += n
dsts = dsts.DropFirst64(n)
@@ -359,11 +442,12 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro
// If we have a mmappable host FD (which must be used here to ensure
// coherence with memory-mapped I/O), or if InteropModeShared is in effect
// (which prevents us from caching file contents), or if the file was
- // opened with O_DIRECT, write directly to dentry.handle without locking
- // dentry.dataMu.
+ // opened with O_DIRECT, write directly to dentry.writeHandleLocked()
+ // without locking dentry.dataMu.
rw.d.handleMu.RLock()
- if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
- n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, srcs, rw.off)
+ h := rw.d.writeHandleLocked()
+ if (rw.d.hostFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
+ n, err := h.writeFromBlocksAt(rw.ctx, srcs, rw.off)
rw.off += n
rw.d.dataMu.Lock()
if rw.off > rw.d.size {
@@ -425,7 +509,7 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro
// for detecting or avoiding this.
gapMR := gap.Range().Intersect(mr)
gapSrcs := srcs.TakeFirst64(gapMR.Length())
- n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start)
+ n, err := h.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start)
done += n
rw.off += n
srcs = srcs.DropFirst64(n)
@@ -451,7 +535,7 @@ exitLoop:
if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{
Start: start,
End: rw.off,
- }, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, rw.d.handle.writeFromBlocksAt); err != nil {
+ }, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, h.writeFromBlocksAt); err != nil {
// We have no idea how many bytes were actually flushed.
rw.off = start
done = 0
@@ -469,6 +553,7 @@ func (d *dentry) writeback(ctx context.Context, offset, size int64) error {
}
d.handleMu.RLock()
defer d.handleMu.RUnlock()
+ h := d.writeHandleLocked()
d.dataMu.Lock()
defer d.dataMu.Unlock()
// Compute the range of valid bytes (overflow-checked).
@@ -482,22 +567,31 @@ func (d *dentry) writeback(ctx context.Context, offset, size int64) error {
return fsutil.SyncDirty(ctx, memmap.MappableRange{
Start: uint64(offset),
End: uint64(end),
- }, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt)
+ }, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt)
}
// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
fd.mu.Lock()
defer fd.mu.Unlock()
+ newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence)
+ if err != nil {
+ return 0, err
+ }
+ fd.off = newOffset
+ return newOffset, nil
+}
+
+// regularFileSeekLocked calculates the new offset for a seek operation on a
+// regular file.
+func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int64, whence int32) (int64, error) {
switch whence {
case linux.SEEK_SET:
// Use offset as specified.
case linux.SEEK_CUR:
- offset += fd.off
+ offset += fdOffset
case linux.SEEK_END, linux.SEEK_DATA, linux.SEEK_HOLE:
// Ensure file size is up to date.
- d := fd.dentry()
- if fd.filesystem().opts.interop == InteropModeShared {
+ if !d.cachedMetadataAuthoritative() {
if err := d.updateFromGetattr(ctx); err != nil {
return 0, err
}
@@ -525,31 +619,12 @@ func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (
if offset < 0 {
return 0, syserror.EINVAL
}
- fd.off = offset
return offset, nil
}
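
Factoring the whence handling into regularFileSeekLocked keeps Seek itself
trivial and makes the calculation testable in isolation. A simplified
standalone sketch of the calculation (hypothetical, omitting
SEEK_DATA/SEEK_HOLE and the size refresh):

package main

import (
	"errors"
	"fmt"
)

const (
	seekSet = 0 // linux.SEEK_SET
	seekCur = 1 // linux.SEEK_CUR
	seekEnd = 2 // linux.SEEK_END
)

func seekLocked(fdOffset, size, offset int64, whence int32) (int64, error) {
	switch whence {
	case seekSet:
		// Use offset as specified.
	case seekCur:
		offset += fdOffset
	case seekEnd:
		offset += size
	default:
		return 0, errors.New("invalid whence")
	}
	if offset < 0 {
		return 0, errors.New("negative resulting offset")
	}
	return offset, nil
}

func main() {
	off, _ := seekLocked(100, 4096, -10, seekCur)
	fmt.Println(off) // 90
}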
// Sync implements vfs.FileDescriptionImpl.Sync.
func (fd *regularFileFD) Sync(ctx context.Context) error {
- return fd.dentry().syncSharedHandle(ctx)
-}
-
-func (d *dentry) syncSharedHandle(ctx context.Context) error {
- d.handleMu.RLock()
- if !d.handleWritable {
- d.handleMu.RUnlock()
- return nil
- }
- d.dataMu.Lock()
- // Write dirty cached data to the remote file.
- err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt)
- d.dataMu.Unlock()
- if err == nil {
- // Sync the remote file.
- err = d.handle.sync(ctx)
- }
- d.handleMu.RUnlock()
- return err
+ return fd.dentry().syncCachedFile(ctx, false /* forFilesystemSync */)
}
// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
@@ -573,7 +648,7 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt
return syserror.ENODEV
}
d.handleMu.RLock()
- haveFD := d.handle.fd >= 0
+ haveFD := d.hostFD >= 0
d.handleMu.RUnlock()
if !haveFD {
return syserror.ENODEV
@@ -594,7 +669,7 @@ func (d *dentry) mayCachePages() bool {
return true
}
d.handleMu.RLock()
- haveFD := d.handle.fd >= 0
+ haveFD := d.hostFD >= 0
d.handleMu.RUnlock()
return haveFD
}
@@ -652,7 +727,7 @@ func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR,
// Translate implements memmap.Mappable.Translate.
func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
d.handleMu.RLock()
- if d.handle.fd >= 0 && !d.fs.opts.forcePageCache {
+ if d.hostFD >= 0 && !d.fs.opts.forcePageCache {
d.handleMu.RUnlock()
mr := optional
if d.fs.opts.limitHostFDTranslation {
@@ -688,7 +763,8 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab
}
mf := d.fs.mfp.MemoryFile()
- cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, d.handle.readToBlocksAt)
+ h := d.readHandleLocked()
+ cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), d.size, mf, usage.PageCache, h.readToBlocksAt)
var ts []memmap.Translation
var translatedEnd uint64
@@ -747,7 +823,7 @@ func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange
// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
- // Whether we have a host fd (and consequently what platform.File is
+ // Whether we have a host fd (and consequently what memmap.File is
// mapped) can change across save/restore, so invalidate all translations
// unconditionally.
d.mapsMu.Lock()
@@ -757,9 +833,12 @@ func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
// Write the cache's contents back to the remote file so that if we have a
// host fd after restore, the remote file's contents are coherent.
mf := d.fs.mfp.MemoryFile()
+ d.handleMu.RLock()
+ defer d.handleMu.RUnlock()
+ h := d.writeHandleLocked()
d.dataMu.Lock()
defer d.dataMu.Unlock()
- if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil {
+ if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
return err
}
@@ -774,20 +853,23 @@ func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
// Evict implements pgalloc.EvictableMemoryUser.Evict.
func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
+ mr := memmap.MappableRange{er.Start, er.End}
+ mf := d.fs.mfp.MemoryFile()
d.mapsMu.Lock()
defer d.mapsMu.Unlock()
+ d.handleMu.RLock()
+ defer d.handleMu.RUnlock()
+ h := d.writeHandleLocked()
d.dataMu.Lock()
defer d.dataMu.Unlock()
- mr := memmap.MappableRange{er.Start, er.End}
- mf := d.fs.mfp.MemoryFile()
// Only allow pages that are no longer memory-mapped to be evicted.
for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() {
mgapMR := mgap.Range().Intersect(mr)
if mgapMR.Length() == 0 {
continue
}
- if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil {
+ if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err)
}
d.cache.Drop(mgapMR, mf)
@@ -795,53 +877,53 @@ func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
}
}
-// dentryPlatformFile implements platform.File. It exists solely because dentry
-// cannot implement both vfs.DentryImpl.IncRef and platform.File.IncRef.
+// dentryPlatformFile implements memmap.File. It exists solely because dentry
+// cannot implement both vfs.DentryImpl.IncRef and memmap.File.IncRef.
//
// dentryPlatformFile is only used when a host FD representing the remote file
-// is available (i.e. dentry.handle.fd >= 0), and that FD is used for
-// application memory mappings (i.e. !filesystem.opts.forcePageCache).
+// is available (i.e. dentry.hostFD >= 0), and that FD is used for application
+// memory mappings (i.e. !filesystem.opts.forcePageCache).
+//
+// +stateify savable
type dentryPlatformFile struct {
*dentry
- // fdRefs counts references on platform.File offsets. fdRefs is protected
+ // fdRefs counts references on memmap.File offsets. fdRefs is protected
// by dentry.dataMu.
fdRefs fsutil.FrameRefSet
- // If this dentry represents a regular file, and handle.fd >= 0,
- // hostFileMapper caches mappings of handle.fd.
+ // If this dentry represents a regular file, and dentry.hostFD >= 0,
+ // hostFileMapper caches mappings of dentry.hostFD.
hostFileMapper fsutil.HostFileMapper
// hostFileMapperInitOnce is used to lazily initialize hostFileMapper.
- hostFileMapperInitOnce sync.Once
+ hostFileMapperInitOnce sync.Once `state:"nosave"`
}
-// IncRef implements platform.File.IncRef.
-func (d *dentryPlatformFile) IncRef(fr platform.FileRange) {
+// IncRef implements memmap.File.IncRef.
+func (d *dentryPlatformFile) IncRef(fr memmap.FileRange) {
d.dataMu.Lock()
d.fdRefs.IncRefAndAccount(fr)
d.dataMu.Unlock()
}
-// DecRef implements platform.File.DecRef.
-func (d *dentryPlatformFile) DecRef(fr platform.FileRange) {
+// DecRef implements memmap.File.DecRef.
+func (d *dentryPlatformFile) DecRef(fr memmap.FileRange) {
d.dataMu.Lock()
d.fdRefs.DecRefAndAccount(fr)
d.dataMu.Unlock()
}
-// MapInternal implements platform.File.MapInternal.
-func (d *dentryPlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+// MapInternal implements memmap.File.MapInternal.
+func (d *dentryPlatformFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
d.handleMu.RLock()
- bs, err := d.hostFileMapper.MapInternal(fr, int(d.handle.fd), at.Write)
- d.handleMu.RUnlock()
- return bs, err
+ defer d.handleMu.RUnlock()
+ return d.hostFileMapper.MapInternal(fr, int(d.hostFD), at.Write)
}
-// FD implements platform.File.FD.
+// FD implements memmap.File.FD.
func (d *dentryPlatformFile) FD() int {
d.handleMu.RLock()
- fd := d.handle.fd
- d.handleMu.RUnlock()
- return int(fd)
+ defer d.handleMu.RUnlock()
+ return int(d.hostFD)
}
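
dentryPlatformFile exists because a single Go type cannot declare two methods
with the same name but different signatures: vfs.DentryImpl needs IncRef()
while memmap.File needs IncRef(memmap.FileRange), so the latter lives on a
wrapper embedding *dentry. A minimal illustration with stand-in interfaces
(not the real gVisor ones):

package main

import "fmt"

type refCounter interface{ IncRef() }                   // stands in for vfs.DentryImpl
type rangeRefCounter interface{ IncRef(lo, hi uint64) } // stands in for memmap.File

type dentry struct{ refs int }

func (d *dentry) IncRef() { d.refs++ }

// platformFile's own IncRef shadows the embedded dentry's IncRef, giving the
// wrapper a distinct method set.
type platformFile struct{ *dentry }

func (f *platformFile) IncRef(lo, hi uint64) {
	fmt.Printf("pin [%d, %d)\n", lo, hi)
}

func main() {
	d := &dentry{}
	var a refCounter = d
	var b rangeRefCounter = &platformFile{d}
	a.IncRef()
	b.IncRef(0, 4096)
}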
diff --git a/pkg/sentry/fsimpl/gofer/save_restore.go b/pkg/sentry/fsimpl/gofer/save_restore.go
new file mode 100644
index 000000000..2ea224c43
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/save_restore.go
@@ -0,0 +1,329 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+ "fmt"
+ "io"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fdnotifier"
+ "gvisor.dev/gvisor/pkg/p9"
+ "gvisor.dev/gvisor/pkg/refsvfs2"
+ "gvisor.dev/gvisor/pkg/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+type saveRestoreContextID int
+
+const (
+ // CtxRestoreServerFDMap is a Context.Value key for a map[string]int
+ // mapping filesystem unique IDs (cf. InternalFilesystemOptions.UniqueID)
+ // to host FDs.
+ CtxRestoreServerFDMap saveRestoreContextID = iota
+)
+
+// +stateify savable
+type savedDentryRW struct {
+ read bool
+ write bool
+}
+
+// PrepareSave implements vfs.FilesystemImplSaveRestoreExtension.PrepareSave.
+func (fs *filesystem) PrepareSave(ctx context.Context) error {
+ if len(fs.iopts.UniqueID) == 0 {
+ return fmt.Errorf("gofer.filesystem with no UniqueID cannot be saved")
+ }
+
+ // Purge cached dentries, which may not be reopenable after restore due to
+ // permission changes.
+ fs.renameMu.Lock()
+ fs.evictAllCachedDentriesLocked(ctx)
+ fs.renameMu.Unlock()
+
+ // Buffer pipe data so that it's available for reading after restore. (This
+ // is a legacy VFS1 feature.)
+ fs.syncMu.Lock()
+ for sffd := range fs.specialFileFDs {
+ if sffd.dentry().fileType() == linux.S_IFIFO && sffd.vfsfd.IsReadable() {
+ if err := sffd.savePipeData(ctx); err != nil {
+ fs.syncMu.Unlock()
+ return err
+ }
+ }
+ }
+ fs.syncMu.Unlock()
+
+ // Flush local state to the remote filesystem.
+ if err := fs.Sync(ctx); err != nil {
+ return err
+ }
+
+ fs.savedDentryRW = make(map[*dentry]savedDentryRW)
+ return fs.root.prepareSaveRecursive(ctx)
+}
+
+// Preconditions:
+// * fd represents a pipe.
+// * fd is readable.
+func (fd *specialFileFD) savePipeData(ctx context.Context) error {
+ fd.bufMu.Lock()
+ defer fd.bufMu.Unlock()
+ var buf [usermem.PageSize]byte
+ for {
+ n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:])), ^uint64(0))
+ if n != 0 {
+ fd.buf = append(fd.buf, buf[:n]...)
+ }
+ if err != nil {
+ if err == io.EOF || err == syserror.EAGAIN {
+ break
+ }
+ return err
+ }
+ }
+ if len(fd.buf) != 0 {
+ atomic.StoreUint32(&fd.haveBuf, 1)
+ }
+ return nil
+}
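
savePipeData drains whatever the pipe currently buffers, stopping at EAGAIN
rather than blocking, so the data survives in the checkpoint. A standalone
sketch of the same drain loop against a host pipe (illustrative only, not
gVisor code):

package main

import (
	"fmt"
	"syscall"
)

func main() {
	var p [2]int
	if err := syscall.Pipe2(p[:], syscall.O_NONBLOCK); err != nil {
		panic(err)
	}
	syscall.Write(p[1], []byte("buffered data"))

	var saved []byte
	buf := make([]byte, 4096)
	for {
		n, err := syscall.Read(p[0], buf)
		if n > 0 {
			saved = append(saved, buf[:n]...)
		}
		if err == syscall.EAGAIN || (n == 0 && err == nil) {
			break // drained (EAGAIN) or write end closed (EOF)
		}
		if err != nil {
			panic(err)
		}
	}
	fmt.Printf("saved %q\n", saved)
}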
+
+func (d *dentry) prepareSaveRecursive(ctx context.Context) error {
+ if d.isRegularFile() && !d.cachedMetadataAuthoritative() {
+ // Get updated metadata for d in case we need to perform metadata
+ // validation during restore.
+ if err := d.updateFromGetattr(ctx); err != nil {
+ return err
+ }
+ }
+ if !d.readFile.isNil() || !d.writeFile.isNil() {
+ d.fs.savedDentryRW[d] = savedDentryRW{
+ read: !d.readFile.isNil(),
+ write: !d.writeFile.isNil(),
+ }
+ }
+ d.dirMu.Lock()
+ defer d.dirMu.Unlock()
+ for _, child := range d.children {
+ if child != nil {
+ if err := child.prepareSaveRecursive(ctx); err != nil {
+ return err
+ }
+ }
+ }
+ return nil
+}
+
+// beforeSave is invoked by stateify.
+func (d *dentry) beforeSave() {
+ if d.vfsd.IsDead() {
+ panic(fmt.Sprintf("gofer.dentry(%q).beforeSave: deleted and invalidated dentries can't be restored", genericDebugPathname(d)))
+ }
+}
+
+// afterLoad is invoked by stateify.
+func (d *dentry) afterLoad() {
+ d.hostFD = -1
+ if refsvfs2.LeakCheckEnabled() && atomic.LoadInt64(&d.refs) != -1 {
+ refsvfs2.Register(d, "gofer.dentry")
+ }
+}
+
+// afterLoad is invoked by stateify.
+func (d *dentryPlatformFile) afterLoad() {
+ if d.hostFileMapper.IsInited() {
+ // Ensure that we don't call d.hostFileMapper.Init() again.
+ d.hostFileMapperInitOnce.Do(func() {})
+ }
+}
+
+// afterLoad is invoked by stateify.
+func (fd *specialFileFD) afterLoad() {
+ fd.handle.fd = -1
+}
+
+// CompleteRestore implements
+// vfs.FilesystemImplSaveRestoreExtension.CompleteRestore.
+func (fs *filesystem) CompleteRestore(ctx context.Context, opts vfs.CompleteRestoreOptions) error {
+ fdmapv := ctx.Value(CtxRestoreServerFDMap)
+ if fdmapv == nil {
+ return fmt.Errorf("no server FD map available")
+ }
+ fdmap := fdmapv.(map[string]int)
+ fd, ok := fdmap[fs.iopts.UniqueID]
+ if !ok {
+ return fmt.Errorf("no server FD available for filesystem with unique ID %q", fs.iopts.UniqueID)
+ }
+ fs.opts.fd = fd
+ if err := fs.dial(ctx); err != nil {
+ return err
+ }
+ fs.inoByQIDPath = make(map[uint64]uint64)
+
+ // Restore the filesystem root.
+ ctx.UninterruptibleSleepStart(false)
+ attached, err := fs.client.Attach(fs.opts.aname)
+ ctx.UninterruptibleSleepFinish(false)
+ if err != nil {
+ return err
+ }
+ attachFile := p9file{attached}
+ qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask())
+ if err != nil {
+ return err
+ }
+ if err := fs.root.restoreFile(ctx, attachFile, qid, attrMask, &attr, &opts); err != nil {
+ return err
+ }
+
+ // Restore remaining dentries.
+ if err := fs.root.restoreDescendantsRecursive(ctx, &opts); err != nil {
+ return err
+ }
+
+ // Re-open handles for specialFileFDs. Unlike the initial open
+ // (dentry.openSpecialFile()), pipes are always opened without blocking;
+ // non-readable pipe FDs are opened last to ensure that they don't get
+ // ENXIO if another specialFileFD represents the read end of the same pipe.
+ // This is consistent with VFS1.
+ haveWriteOnlyPipes := false
+ for fd := range fs.specialFileFDs {
+ if fd.dentry().fileType() == linux.S_IFIFO && !fd.vfsfd.IsReadable() {
+ haveWriteOnlyPipes = true
+ continue
+ }
+ if err := fd.completeRestore(ctx); err != nil {
+ return err
+ }
+ }
+ if haveWriteOnlyPipes {
+ for fd := range fs.specialFileFDs {
+ if fd.dentry().fileType() == linux.S_IFIFO && !fd.vfsfd.IsReadable() {
+ if err := fd.completeRestore(ctx); err != nil {
+ return err
+ }
+ }
+ }
+ }
+
+ // Discard state only required during restore.
+ fs.savedDentryRW = nil
+
+ return nil
+}
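
The two-phase loop above exists because opening a FIFO with
O_WRONLY|O_NONBLOCK fails with ENXIO unless a reader is already present, so
readable pipe ends must be reopened first. A standalone demonstration
(Linux-only, illustrative):

package main

import (
	"fmt"
	"os"
	"syscall"
)

func main() {
	path := os.TempDir() + "/demo.fifo"
	syscall.Mkfifo(path, 0600)
	defer os.Remove(path)

	// No reader yet: the nonblocking write-only open fails.
	_, err := syscall.Open(path, syscall.O_WRONLY|syscall.O_NONBLOCK, 0)
	fmt.Println(err) // no such device or address (ENXIO)

	// Once a reader exists, the same open succeeds.
	rfd, _ := syscall.Open(path, syscall.O_RDONLY|syscall.O_NONBLOCK, 0)
	wfd, err := syscall.Open(path, syscall.O_WRONLY|syscall.O_NONBLOCK, 0)
	fmt.Println(err) // <nil>
	syscall.Close(rfd)
	syscall.Close(wfd)
}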
+
+func (d *dentry) restoreFile(ctx context.Context, file p9file, qid p9.QID, attrMask p9.AttrMask, attr *p9.Attr, opts *vfs.CompleteRestoreOptions) error {
+ d.file = file
+
+ // Gofers do not preserve QID across checkpoint/restore, so:
+ //
+ // - We must assume that the remote filesystem did not change in a way that
+ // would invalidate dentries, since we can't revalidate dentries by
+ // checking QIDs.
+ //
+ // - We need to associate the new QID.Path with the existing d.ino.
+ d.qidPath = qid.Path
+ d.fs.inoMu.Lock()
+ d.fs.inoByQIDPath[qid.Path] = d.ino
+ d.fs.inoMu.Unlock()
+
+ // Check metadata stability before updating metadata.
+ d.metadataMu.Lock()
+ defer d.metadataMu.Unlock()
+ if d.isRegularFile() {
+ if opts.ValidateFileSizes {
+ if !attrMask.Size {
+ return fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: file size not available", genericDebugPathname(d))
+ }
+ if d.size != attr.Size {
+ return fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: size changed from %d to %d", genericDebugPathname(d), d.size, attr.Size)
+ }
+ }
+ if opts.ValidateFileModificationTimestamps {
+ if !attrMask.MTime {
+ return fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime not available", genericDebugPathname(d))
+ }
+ if want := dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds); d.mtime != want {
+ return fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime changed from %+v to %+v", genericDebugPathname(d), linux.NsecToStatxTimestamp(d.mtime), linux.NsecToStatxTimestamp(want))
+ }
+ }
+ }
+ if !d.cachedMetadataAuthoritative() {
+ d.updateFromP9AttrsLocked(attrMask, attr)
+ }
+
+ if rw, ok := d.fs.savedDentryRW[d]; ok {
+ if err := d.ensureSharedHandle(ctx, rw.read, rw.write, false /* trunc */); err != nil {
+ return err
+ }
+ }
+
+ return nil
+}
+
+// Preconditions: d is not synthetic.
+func (d *dentry) restoreDescendantsRecursive(ctx context.Context, opts *vfs.CompleteRestoreOptions) error {
+ for _, child := range d.children {
+ if child == nil {
+ continue
+ }
+ if _, ok := d.fs.syncableDentries[child]; !ok {
+ // child is synthetic.
+ continue
+ }
+ if err := child.restoreRecursive(ctx, opts); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+// Preconditions: d is not synthetic. (Note that because this function is
+// what restores d.file, d.file.isNil() is still true at this point, so
+// whether d is synthetic can only be detected by checking
+// filesystem.syncableDentries.) d.parent has been restored.
+func (d *dentry) restoreRecursive(ctx context.Context, opts *vfs.CompleteRestoreOptions) error {
+ qid, file, attrMask, attr, err := d.parent.file.walkGetAttrOne(ctx, d.name)
+ if err != nil {
+ return err
+ }
+ if err := d.restoreFile(ctx, file, qid, attrMask, &attr, opts); err != nil {
+ return err
+ }
+ return d.restoreDescendantsRecursive(ctx, opts)
+}
+
+func (fd *specialFileFD) completeRestore(ctx context.Context) error {
+ d := fd.dentry()
+ h, err := openHandle(ctx, d.file, fd.vfsfd.IsReadable(), fd.vfsfd.IsWritable(), false /* trunc */)
+ if err != nil {
+ return err
+ }
+ fd.handle = h
+
+ ftype := d.fileType()
+ fd.haveQueue = (ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK) && fd.handle.fd >= 0
+ if fd.haveQueue {
+ if err := fdnotifier.AddFD(fd.handle.fd, &fd.queue); err != nil {
+ return err
+ }
+ }
+
+ return nil
+}
diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go
index d6dbe9092..a21199eac 100644
--- a/pkg/sentry/fsimpl/gofer/socket.go
+++ b/pkg/sentry/fsimpl/gofer/socket.go
@@ -36,13 +36,12 @@ func (d *dentry) isSocket() bool {
// An endpoint's lifetime is the time between when filesystem.BoundEndpointAt()
// is called and either BoundEndpoint.BidirectionalConnect or
// BoundEndpoint.UnidirectionalConnect is called.
+//
+// +stateify savable
type endpoint struct {
// dentry is the filesystem dentry which produced this endpoint.
dentry *dentry
- // file is the p9 file that contains a single unopened fid.
- file p9.File
-
// path is the sentry path where this endpoint is bound.
path string
}
@@ -108,13 +107,13 @@ func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.Connect
// We don't need the receiver.
c.CloseRecv()
- c.Release()
+ c.Release(ctx)
return c, nil
}
func (e *endpoint) newConnectedEndpoint(ctx context.Context, flags p9.ConnectFlags, queue *waiter.Queue) (*host.SCMConnectedEndpoint, *syserr.Error) {
- hostFile, err := e.file.Connect(flags)
+ hostFile, err := e.dentry.file.connect(ctx, flags)
if err != nil {
return nil, syserr.ErrConnectionRefused
}
@@ -129,15 +128,15 @@ func (e *endpoint) newConnectedEndpoint(ctx context.Context, flags p9.ConnectFla
c, serr := host.NewSCMEndpoint(ctx, hostFD, queue, e.path)
if serr != nil {
- log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v flags %+v: %v", e.file, flags, serr)
+ log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v flags %+v: %v", e.dentry.file, flags, serr)
return nil, serr
}
return c, nil
}
// Release implements transport.BoundEndpoint.Release.
-func (e *endpoint) Release() {
- e.dentry.DecRef()
+func (e *endpoint) Release(ctx context.Context) {
+ e.dentry.DecRef(ctx)
}
// Passcred implements transport.BoundEndpoint.Passcred.
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
index ff6126b87..625400c0b 100644
--- a/pkg/sentry/fsimpl/gofer/special_file.go
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -15,52 +15,73 @@
package gofer
import (
- "sync"
+ "sync/atomic"
+ "syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fdnotifier"
+ "gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
// specialFileFD implements vfs.FileDescriptionImpl for pipes, sockets, device
-// special files, and (when filesystemOptions.specialRegularFiles is in effect)
-// regular files. specialFileFD differs from regularFileFD by using per-FD
-// handles instead of shared per-dentry handles, and never buffering I/O.
+// special files, and (when filesystemOptions.regularFilesUseSpecialFileFD is
+// in effect) regular files. specialFileFD differs from regularFileFD by using
+// per-FD handles instead of shared per-dentry handles, and never buffering I/O.
+//
+// +stateify savable
type specialFileFD struct {
fileDescription
// handle is used for file I/O. handle is immutable.
- handle handle
+ handle handle `state:"nosave"`
+
+ // isRegularFile is true if this FD represents a regular file, which is
+ // only possible when filesystemOptions.regularFilesUseSpecialFileFD is in
+ // effect. isRegularFile is immutable.
+ isRegularFile bool
// seekable is true if this file description represents a file for which
- // file offset is significant, i.e. a regular file. seekable is immutable.
+ // file offset is significant, i.e. a regular file, character device or
+ // block device. seekable is immutable.
seekable bool
- // mayBlock is true if this file description represents a file for which
- // queue may send I/O readiness events. mayBlock is immutable.
- mayBlock bool
- queue waiter.Queue
+ // haveQueue is true if this file description represents a file for which
+ // queue may send I/O readiness events. haveQueue is immutable.
+ haveQueue bool `state:"nosave"`
+ queue waiter.Queue
// If seekable is true, off is the file offset. off is protected by mu.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
off int64
+
+ // If haveBuf is non-zero, this FD represents a pipe, and buf contains data
+ // read from the pipe from previous calls to specialFileFD.savePipeData().
+ // haveBuf and buf are protected by bufMu. haveBuf is accessed using atomic
+ // memory operations.
+ bufMu sync.Mutex `state:"nosave"`
+ haveBuf uint32
+ buf []byte
}
-func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*specialFileFD, error) {
+func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, flags uint32) (*specialFileFD, error) {
ftype := d.fileType()
- seekable := ftype == linux.S_IFREG
- mayBlock := ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK
+ seekable := ftype == linux.S_IFREG || ftype == linux.S_IFCHR || ftype == linux.S_IFBLK
+ haveQueue := (ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK) && h.fd >= 0
fd := &specialFileFD{
- handle: h,
- seekable: seekable,
- mayBlock: mayBlock,
+ handle: h,
+ isRegularFile: ftype == linux.S_IFREG,
+ seekable: seekable,
+ haveQueue: haveQueue,
}
- if mayBlock && h.fd >= 0 {
+ fd.LockFD.Init(locks)
+ if haveQueue {
if err := fdnotifier.AddFD(h.fd, &fd.queue); err != nil {
return nil, err
}
@@ -69,20 +90,23 @@ func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*speci
DenyPRead: !seekable,
DenyPWrite: !seekable,
}); err != nil {
- if mayBlock && h.fd >= 0 {
+ if haveQueue {
fdnotifier.RemoveFD(h.fd)
}
return nil, err
}
+ d.fs.syncMu.Lock()
+ d.fs.specialFileFDs[fd] = struct{}{}
+ d.fs.syncMu.Unlock()
return fd, nil
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *specialFileFD) Release() {
- if fd.mayBlock && fd.handle.fd >= 0 {
+func (fd *specialFileFD) Release(ctx context.Context) {
+ if fd.haveQueue {
fdnotifier.RemoveFD(fd.handle.fd)
}
- fd.handle.close(context.Background())
+ fd.handle.close(ctx)
fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
fs.syncMu.Lock()
delete(fs.specialFileFDs, fd)
@@ -99,7 +123,7 @@ func (fd *specialFileFD) OnClose(ctx context.Context) error {
// Readiness implements waiter.Waitable.Readiness.
func (fd *specialFileFD) Readiness(mask waiter.EventMask) waiter.EventMask {
- if fd.mayBlock {
+ if fd.haveQueue {
return fdnotifier.NonBlockingPoll(fd.handle.fd, mask)
}
return fd.fileDescription.Readiness(mask)
@@ -107,8 +131,9 @@ func (fd *specialFileFD) Readiness(mask waiter.EventMask) waiter.EventMask {
// EventRegister implements waiter.Waitable.EventRegister.
func (fd *specialFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
- if fd.mayBlock {
+ if fd.haveQueue {
fd.queue.EventRegister(e, mask)
+ fdnotifier.UpdateFD(fd.handle.fd)
return
}
fd.fileDescription.EventRegister(e, mask)
@@ -116,42 +141,82 @@ func (fd *specialFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
// EventUnregister implements waiter.Waitable.EventUnregister.
func (fd *specialFileFD) EventUnregister(e *waiter.Entry) {
- if fd.mayBlock {
+ if fd.haveQueue {
fd.queue.EventUnregister(e)
+ fdnotifier.UpdateFD(fd.handle.fd)
return
}
fd.fileDescription.EventUnregister(e)
}
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *specialFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ if fd.isRegularFile {
+ d := fd.dentry()
+ return d.doAllocate(ctx, offset, length, func() error {
+ return fd.handle.file.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
+ })
+ }
+ return fd.FileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length)
+}
+
// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
if fd.seekable && offset < 0 {
return 0, syserror.EINVAL
}
- if opts.Flags != 0 {
+
+ // Check that flags are supported.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+ if opts.Flags&^linux.RWF_HIPRI != 0 {
return 0, syserror.EOPNOTSUPP
}
- // Going through dst.CopyOutFrom() holds MM locks around file operations of
- // unknown duration. For regularFileFD, doing so is necessary to support
- // mmap due to lock ordering; MM locks precede dentry.dataMu. That doesn't
- // hold here since specialFileFD doesn't client-cache data. Just buffer the
- // read instead.
- if d := fd.dentry(); d.fs.opts.interop != InteropModeShared {
+ if d := fd.dentry(); d.cachedMetadataAuthoritative() {
d.touchAtime(fd.vfsfd.Mount())
}
+
+ bufN := int64(0)
+ if atomic.LoadUint32(&fd.haveBuf) != 0 {
+ var err error
+ fd.bufMu.Lock()
+ if len(fd.buf) != 0 {
+ var n int
+ n, err = dst.CopyOut(ctx, fd.buf)
+ dst = dst.DropFirst(n)
+ fd.buf = fd.buf[n:]
+ if len(fd.buf) == 0 {
+ atomic.StoreUint32(&fd.haveBuf, 0)
+ fd.buf = nil
+ }
+ bufN = int64(n)
+ if offset >= 0 {
+ offset += bufN
+ }
+ }
+ fd.bufMu.Unlock()
+ if err != nil {
+ return bufN, err
+ }
+ }
+
+ // Going through dst.CopyOutFrom() would hold MM locks around file
+ // operations of unknown duration. For regularFileFD, doing so is necessary
+ // to support mmap due to lock ordering; MM locks precede dentry.dataMu.
+ // That doesn't hold here since specialFileFD doesn't client-cache data.
+ // Just buffer the read instead.
buf := make([]byte, dst.NumBytes())
n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
if err == syserror.EAGAIN {
err = syserror.ErrWouldBlock
}
if n == 0 {
- return 0, err
+ return bufN, err
}
if cp, cperr := dst.CopyOut(ctx, buf[:n]); cperr != nil {
- return int64(cp), cperr
+ return bufN + int64(cp), cperr
}
- return int64(n), err
+ return bufN + int64(n), err
}
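
The buffered branch above serves checkpoint-saved pipe data before falling
through to the reopened handle. A standalone sketch of that read order
(illustrative types, not gVisor code):

package main

import (
	"fmt"
	"strings"
)

type bufferedReader struct {
	buf []byte          // data saved at checkpoint time
	src *strings.Reader // stands in for the reopened pipe
}

func (r *bufferedReader) Read(p []byte) (int, error) {
	if len(r.buf) > 0 {
		n := copy(p, r.buf)
		r.buf = r.buf[n:]
		return n, nil
	}
	return r.src.Read(p)
}

func main() {
	r := &bufferedReader{buf: []byte("saved|"), src: strings.NewReader("live")}
	out := make([]byte, 16)
	n, _ := r.Read(out)
	m, _ := r.Read(out[n:])
	fmt.Println(string(out[:n+m])) // saved|live
}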
// Read implements vfs.FileDescriptionImpl.Read.
@@ -169,35 +234,82 @@ func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts
// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ n, _, err := fd.pwrite(ctx, src, offset, opts)
+ return n, err
+}
+
+// pwrite returns the number of bytes written, the final offset, and an
+// error. The final offset should be ignored by PWrite.
+func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
if fd.seekable && offset < 0 {
- return 0, syserror.EINVAL
+ return 0, offset, syserror.EINVAL
}
- if opts.Flags != 0 {
- return 0, syserror.EOPNOTSUPP
+
+ // Check that flags are supported.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
+ if opts.Flags&^linux.RWF_HIPRI != 0 {
+ return 0, offset, syserror.EOPNOTSUPP
}
- if fd.seekable {
+ d := fd.dentry()
+ if fd.isRegularFile {
+ // If the regular file fd was opened with O_APPEND, make sure the file
+ // size is up to date. There is a possible race here if the size is
+ // modified externally after the metadata cache is updated.
+ if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
+ if err := d.updateFromGetattr(ctx); err != nil {
+ return 0, offset, err
+ }
+ }
+
+ // We need to hold the metadataMu *while* writing to a regular file.
+ d.metadataMu.Lock()
+ defer d.metadataMu.Unlock()
+
+ // Set offset to file size if the regular file was opened with O_APPEND.
+ if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+ // Holding d.metadataMu is sufficient for reading d.size.
+ offset = int64(d.size)
+ }
limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
if err != nil {
- return 0, err
+ return 0, offset, err
}
src = src.TakeFirst64(limit)
}
// Do a buffered write. See rationale in PRead.
- if d := fd.dentry(); d.fs.opts.interop != InteropModeShared {
+ if d.cachedMetadataAuthoritative() {
d.touchCMtime()
}
buf := make([]byte, src.NumBytes())
- // Don't do partial writes if we get a partial read from src.
- if _, err := src.CopyIn(ctx, buf); err != nil {
- return 0, err
+ copied, copyErr := src.CopyIn(ctx, buf)
+ if copied == 0 && copyErr != nil {
+ // Only return the error if we didn't get any data.
+ return 0, offset, copyErr
}
- n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
+ n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:copied])), uint64(offset))
if err == syserror.EAGAIN {
err = syserror.ErrWouldBlock
}
- return int64(n), err
+ // Update offset if the offset is valid.
+ if offset >= 0 {
+ offset += int64(n)
+ }
+ // Update file size for regular files.
+ if fd.isRegularFile {
+ // d.metadataMu is already locked at this point.
+ if uint64(offset) > d.size {
+ d.dataMu.Lock()
+ defer d.dataMu.Unlock()
+ atomic.StoreUint64(&d.size, uint64(offset))
+ }
+ }
+ if err != nil {
+ return int64(n), offset, err
+ }
+ return int64(n), offset, copyErr
}
// Write implements vfs.FileDescriptionImpl.Write.
@@ -207,8 +319,8 @@ func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts
}
fd.mu.Lock()
- n, err := fd.PWrite(ctx, src, fd.off, opts)
- fd.off += n
+ n, off, err := fd.pwrite(ctx, src, fd.off, opts)
+ fd.off = off
fd.mu.Unlock()
return n, err
}
@@ -220,27 +332,41 @@ func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (
}
fd.mu.Lock()
defer fd.mu.Unlock()
- switch whence {
- case linux.SEEK_SET:
- // Use offset as given.
- case linux.SEEK_CUR:
- offset += fd.off
- default:
- // SEEK_END, SEEK_DATA, and SEEK_HOLE aren't supported since it's not
- // clear that file size is even meaningful for these files.
- return 0, syserror.EINVAL
- }
- if offset < 0 {
- return 0, syserror.EINVAL
+ newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence)
+ if err != nil {
+ return 0, err
}
- fd.off = offset
- return offset, nil
+ fd.off = newOffset
+ return newOffset, nil
}
// Sync implements vfs.FileDescriptionImpl.Sync.
func (fd *specialFileFD) Sync(ctx context.Context) error {
- if !fd.vfsfd.IsWritable() {
- return nil
+ return fd.sync(ctx, false /* forFilesystemSync */)
+}
+
+func (fd *specialFileFD) sync(ctx context.Context, forFilesystemSync bool) error {
+ err := func() error {
+ // If we have a host FD, fsyncing it is likely to be faster than an fsync
+ // RPC.
+ if fd.handle.fd >= 0 {
+ ctx.UninterruptibleSleepStart(false)
+ err := syscall.Fsync(int(fd.handle.fd))
+ ctx.UninterruptibleSleepFinish(false)
+ return err
+ }
+ return fd.handle.file.fsync(ctx)
+ }()
+ if err != nil {
+ if !forFilesystemSync {
+ return err
+ }
+ // Only return err if we could reasonably have expected sync to succeed
+ // (fd represents a regular file that was opened for writing).
+ if fd.isRegularFile && fd.vfsfd.IsWritable() {
+ return err
+ }
+ ctx.Debugf("gofer.specialFileFD.sync: syncing non-writable or non-regular-file FD failed: %v", err)
}
- return fd.handle.sync(ctx)
+ return nil
}
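
The new PRead above drains fd.buf (pipe bytes captured at save time) before issuing the host read, then folds the two byte counts together. A minimal standalone sketch of that ordering, with illustrative names (bufferedFD and readFromHost are not part of the gofer package; imports elided):

	type bufferedFD struct {
		mu  sync.Mutex
		buf []byte // bytes saved across save/restore, served first
	}

	// read serves saved bytes before falling through to the backing fd, so a
	// restored reader observes the same byte stream it would have seen live.
	func (fd *bufferedFD) read(dst []byte, readFromHost func([]byte) (int, error)) (int, error) {
		fd.mu.Lock()
		n := copy(dst, fd.buf)
		fd.buf = fd.buf[n:]
		fd.mu.Unlock()
		if n == len(dst) {
			return n, nil
		}
		m, err := readFromHost(dst[n:])
		return n + m, err
	}
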
diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go
index 2608e7e1d..9cbe805b9 100644
--- a/pkg/sentry/fsimpl/gofer/time.go
+++ b/pkg/sentry/fsimpl/gofer/time.go
@@ -17,7 +17,6 @@ package gofer
import (
"sync/atomic"
- "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/vfs"
)
@@ -25,31 +24,39 @@ func dentryTimestampFromP9(s, ns uint64) int64 {
return int64(s*1e9 + ns)
}
-func dentryTimestampFromStatx(ts linux.StatxTimestamp) int64 {
- return ts.Sec*1e9 + int64(ts.Nsec)
-}
-
-func statxTimestampFromDentry(ns int64) linux.StatxTimestamp {
- return linux.StatxTimestamp{
- Sec: ns / 1e9,
- Nsec: uint32(ns % 1e9),
- }
-}
-
-// Preconditions: fs.interop != InteropModeShared.
+// Preconditions: d.cachedMetadataAuthoritative() == true.
func (d *dentry) touchAtime(mnt *vfs.Mount) {
+ if mnt.Flags.NoATime || mnt.ReadOnly() {
+ return
+ }
if err := mnt.CheckBeginWrite(); err != nil {
return
}
now := d.fs.clock.Now().Nanoseconds()
d.metadataMu.Lock()
atomic.StoreInt64(&d.atime, now)
+ atomic.StoreUint32(&d.atimeDirty, 1)
d.metadataMu.Unlock()
mnt.EndWrite()
}
-// Preconditions: fs.interop != InteropModeShared. The caller has successfully
-// called vfs.Mount.CheckBeginWrite().
+// Preconditions: d.metadataMu is locked. d.cachedMetadataAuthoritative() == true.
+func (d *dentry) touchAtimeLocked(mnt *vfs.Mount) {
+ if mnt.Flags.NoATime || mnt.ReadOnly() {
+ return
+ }
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return
+ }
+ now := d.fs.clock.Now().Nanoseconds()
+ atomic.StoreInt64(&d.atime, now)
+ atomic.StoreUint32(&d.atimeDirty, 1)
+ mnt.EndWrite()
+}
+
+// Preconditions:
+// * d.cachedMetadataAuthoritative() == true.
+// * The caller has successfully called vfs.Mount.CheckBeginWrite().
func (d *dentry) touchCtime() {
now := d.fs.clock.Now().Nanoseconds()
d.metadataMu.Lock()
@@ -57,18 +64,24 @@ func (d *dentry) touchCtime() {
d.metadataMu.Unlock()
}
-// Preconditions: fs.interop != InteropModeShared. The caller has successfully
-// called vfs.Mount.CheckBeginWrite().
+// Preconditions:
+// * d.cachedMetadataAuthoritative() == true.
+// * The caller has successfully called vfs.Mount.CheckBeginWrite().
func (d *dentry) touchCMtime() {
now := d.fs.clock.Now().Nanoseconds()
d.metadataMu.Lock()
atomic.StoreInt64(&d.mtime, now)
atomic.StoreInt64(&d.ctime, now)
+ atomic.StoreUint32(&d.mtimeDirty, 1)
d.metadataMu.Unlock()
}
+// Preconditions:
+// * d.cachedMetadataAuthoritative() == true.
+// * The caller has locked d.metadataMu.
func (d *dentry) touchCMtimeLocked() {
now := d.fs.clock.Now().Nanoseconds()
atomic.StoreInt64(&d.mtime, now)
atomic.StoreInt64(&d.ctime, now)
+ atomic.StoreUint32(&d.mtimeDirty, 1)
}
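
The touch* helpers above now mark each cached timestamp dirty so it can be written back to the remote filesystem later. A compact sketch of the pattern, assuming a struct shaped like the dentry's metadata fields (meta and touchAtime are illustrative; sync and sync/atomic imports elided):

	type meta struct {
		metadataMu sync.Mutex
		atime      int64  // nanoseconds; written under metadataMu
		atimeDirty uint32 // accessed atomically; 1 => needs writeback
	}

	func (m *meta) touchAtime(now int64) {
		m.metadataMu.Lock()
		atomic.StoreInt64(&m.atime, now)
		atomic.StoreUint32(&m.atimeDirty, 1) // flushed by a later SetAttr
		m.metadataMu.Unlock()
	}
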
diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD
index ca0fe6d2b..dc0f86061 100644
--- a/pkg/sentry/fsimpl/host/BUILD
+++ b/pkg/sentry/fsimpl/host/BUILD
@@ -1,14 +1,40 @@
load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
licenses(["notice"])
+go_template_instance(
+ name = "inode_refs",
+ out = "inode_refs.go",
+ package = "host",
+ prefix = "inode",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "inode",
+ },
+)
+
+go_template_instance(
+ name = "connected_endpoint_refs",
+ out = "connected_endpoint_refs.go",
+ package = "host",
+ prefix = "ConnectedEndpoint",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "ConnectedEndpoint",
+ },
+)
+
go_library(
name = "host",
srcs = [
+ "connected_endpoint_refs.go",
"control.go",
"host.go",
+ "inode_refs.go",
"ioctl_unsafe.go",
"mmap.go",
+ "save_restore.go",
"socket.go",
"socket_iovec.go",
"socket_unsafe.go",
@@ -22,17 +48,20 @@ go_library(
"//pkg/context",
"//pkg/fdnotifier",
"//pkg/fspath",
+ "//pkg/iovec",
"//pkg/log",
+ "//pkg/marshal/primitive",
"//pkg/refs",
+ "//pkg/refsvfs2",
"//pkg/safemem",
"//pkg/sentry/arch",
"//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/fsimpl/kernfs",
"//pkg/sentry/hostfd",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/memmap",
- "//pkg/sentry/platform",
"//pkg/sentry/socket/control",
"//pkg/sentry/socket/unix",
"//pkg/sentry/socket/unix/transport",
diff --git a/pkg/sentry/fsimpl/host/control.go b/pkg/sentry/fsimpl/host/control.go
index b9082a20f..13ef48cb5 100644
--- a/pkg/sentry/fsimpl/host/control.go
+++ b/pkg/sentry/fsimpl/host/control.go
@@ -58,7 +58,7 @@ func (c *scmRights) Clone() transport.RightsControlMessage {
}
// Release implements transport.RightsControlMessage.Release.
-func (c *scmRights) Release() {
+func (c *scmRights) Release(ctx context.Context) {
for _, fd := range c.fds {
syscall.Close(fd)
}
@@ -79,7 +79,7 @@ func fdsToFiles(ctx context.Context, fds []int) []*vfs.FileDescription {
}
// Create the file backed by hostFD.
- file, err := ImportFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fd, false /* isTTY */)
+ file, err := NewFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fd, &NewFDOptions{})
if err != nil {
ctx.Warningf("Error creating file from host FD: %v", err)
break
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 18b127521..eeed0f97d 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -19,6 +19,7 @@ package host
import (
"fmt"
"math"
+ "sync/atomic"
"syscall"
"golang.org/x/sys/unix"
@@ -27,7 +28,7 @@ import (
"gvisor.dev/gvisor/pkg/fdnotifier"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/log"
- "gvisor.dev/gvisor/pkg/refs"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/hostfd"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -40,8 +41,123 @@ import (
"gvisor.dev/gvisor/pkg/waiter"
)
+// inode implements kernfs.Inode.
+//
+// +stateify savable
+type inode struct {
+ kernfs.InodeNoStatFS
+ kernfs.InodeNotDirectory
+ kernfs.InodeNotSymlink
+ kernfs.InodeTemporary // Meaningless here, as this inode can't be looked up and is always valid.
+
+ locks vfs.FileLocks
+
+ // When the reference count reaches zero, the host fd is closed.
+ inodeRefs
+
+ // hostFD contains the host fd that this file was originally created from,
+ // which must be available at time of restore.
+ //
+ // This field is initialized at creation time and is immutable.
+ hostFD int
+
+ // ino is an inode number unique within this filesystem.
+ //
+ // This field is initialized at creation time and is immutable.
+ ino uint64
+
+ // ftype is the file's type (a linux.S_IFMT mask).
+ //
+ // This field is initialized at creation time and is immutable.
+ ftype uint16
+
+ // mayBlock is true if hostFD is non-blocking, and operations on it may
+ // return EAGAIN or EWOULDBLOCK instead of blocking.
+ //
+ // This field is initialized at creation time and is immutable.
+ mayBlock bool
+
+ // seekable is false if lseek(hostFD) returns ESPIPE. We assume that file
+ // offsets are meaningful iff seekable is true.
+ //
+ // This field is initialized at creation time and is immutable.
+ seekable bool
+
+ // isTTY is true if this file represents a TTY.
+ //
+ // This field is initialized at creation time and is immutable.
+ isTTY bool
+
+ // savable is true if hostFD may be saved/restored by its numeric value.
+ //
+ // This field is initialized at creation time and is immutable.
+ savable bool
+
+ // Event queue for blocking operations.
+ queue waiter.Queue
+
+ // mapsMu protects mappings.
+ mapsMu sync.Mutex `state:"nosave"`
+
+ // If this file is mmappable, mappings tracks mappings of hostFD into
+ // memmap.MappingSpaces.
+ mappings memmap.MappingSet
+
+ // pf implements platform.File for mappings of hostFD.
+ pf inodePlatformFile
+
+ // If haveBuf is non-zero, hostFD represents a pipe, and buf contains data
+ // read from the pipe from previous calls to inode.beforeSave(). haveBuf
+ // and buf are protected by bufMu. haveBuf is accessed using atomic memory
+ // operations.
+ bufMu sync.Mutex `state:"nosave"`
+ haveBuf uint32
+ buf []byte
+}
+
+func newInode(ctx context.Context, fs *filesystem, hostFD int, savable bool, fileType linux.FileMode, isTTY bool) (*inode, error) {
+ // Determine if hostFD is seekable.
+ _, err := unix.Seek(hostFD, 0, linux.SEEK_CUR)
+ seekable := err != syserror.ESPIPE
+ // We expect regular files to be seekable, as this is required for them to
+ // be memory-mappable.
+ if !seekable && fileType == syscall.S_IFREG {
+ ctx.Infof("host.newInode: host FD %d is a non-seekable regular file", hostFD)
+ return nil, syserror.ESPIPE
+ }
+
+ i := &inode{
+ hostFD: hostFD,
+ ino: fs.NextIno(),
+ ftype: uint16(fileType),
+ mayBlock: fileType != syscall.S_IFREG && fileType != syscall.S_IFDIR,
+ seekable: seekable,
+ isTTY: isTTY,
+ savable: savable,
+ }
+ i.pf.inode = i
+ i.EnableLeakCheck()
+
+ // If the hostFD can return EWOULDBLOCK when set to non-blocking, do so and
+ // handle blocking behavior in the sentry.
+ if i.mayBlock {
+ if err := syscall.SetNonblock(i.hostFD, true); err != nil {
+ return nil, err
+ }
+ if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil {
+ return nil, err
+ }
+ }
+ return i, nil
+}
+
// NewFDOptions contains options to NewFD.
type NewFDOptions struct {
+ // If Savable is true, the host file descriptor may be saved/restored by
+ // numeric value; the sandbox API requires a corresponding host FD with the
+ // same numeric value to be provided at time of restore.
+ Savable bool
+
// If IsTTY is true, the file descriptor is a TTY.
IsTTY bool
@@ -75,73 +191,48 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions)
flags = uint32(flagsInt)
}
- fileMode := linux.FileMode(s.Mode)
- fileType := fileMode.FileType()
-
- // Determine if hostFD is seekable. If not, this syscall will return ESPIPE
- // (see fs/read_write.c:llseek), e.g. for pipes, sockets, and some character
- // devices.
- _, err := unix.Seek(hostFD, 0, linux.SEEK_CUR)
- seekable := err != syserror.ESPIPE
-
- i := &inode{
- hostFD: hostFD,
- ino: fs.NextIno(),
- isTTY: opts.IsTTY,
- wouldBlock: wouldBlock(uint32(fileType)),
- seekable: seekable,
- canMap: canMap(uint32(fileType)),
- }
- i.pf.inode = i
-
- // Non-seekable files can't be memory mapped, assert this.
- if !i.seekable && i.canMap {
- panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
- }
-
- // If the hostFD would block, we must set it to non-blocking and handle
- // blocking behavior in the sentry.
- if i.wouldBlock {
- if err := syscall.SetNonblock(i.hostFD, true); err != nil {
- return nil, err
- }
- if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil {
- return nil, err
- }
- }
-
d := &kernfs.Dentry{}
- d.Init(i)
+ i, err := newInode(ctx, fs, hostFD, opts.Savable, linux.FileMode(s.Mode).FileType(), opts.IsTTY)
+ if err != nil {
+ return nil, err
+ }
+ d.Init(&fs.Filesystem, i)
// i.open will take a reference on d.
- defer d.DecRef()
+ defer d.DecRef(ctx)
// For simplicity, fileDescription.offset is set to 0. Technically, we
// should only set it to 0 for files that are not seekable (sockets, pipes,
// etc.), and use the offset from the host fd otherwise when importing.
- return i.open(ctx, d.VFSDentry(), mnt, flags)
+ return i.open(ctx, d, mnt, flags)
}
// ImportFD sets up and returns a vfs.FileDescription from a donated fd.
func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) {
return NewFD(ctx, mnt, hostFD, &NewFDOptions{
- IsTTY: isTTY,
+ Savable: true,
+ IsTTY: isTTY,
})
}
// filesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type filesystemType struct{}
-// GetFilesystem implements FilesystemType.GetFilesystem.
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
panic("host.filesystemType.GetFilesystem should never be called")
}
-// Name implements FilesystemType.Name.
+// Name implements vfs.FilesystemType.Name.
func (filesystemType) Name() string {
return "none"
}
+// Release implements vfs.FilesystemType.Release.
+func (filesystemType) Release(ctx context.Context) {}
+
// NewFilesystem sets up and returns a new hostfs filesystem.
//
// Note that there should only ever be one instance of host.filesystem,
@@ -159,15 +250,17 @@ func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) {
}
// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
type filesystem struct {
kernfs.Filesystem
devMinor uint32
}
-func (fs *filesystem) Release() {
+func (fs *filesystem) Release(ctx context.Context) {
fs.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
- fs.Filesystem.Release()
+ fs.Filesystem.Release(ctx)
}
func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
@@ -177,63 +270,7 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
return vfs.PrependPathSyntheticError{}
}
-// inode implements kernfs.Inode.
-type inode struct {
- kernfs.InodeNotDirectory
- kernfs.InodeNotSymlink
-
- // When the reference count reaches zero, the host fd is closed.
- refs.AtomicRefCount
-
- // hostFD contains the host fd that this file was originally created from,
- // which must be available at time of restore.
- //
- // This field is initialized at creation time and is immutable.
- hostFD int
-
- // ino is an inode number unique within this filesystem.
- //
- // This field is initialized at creation time and is immutable.
- ino uint64
-
- // isTTY is true if this file represents a TTY.
- //
- // This field is initialized at creation time and is immutable.
- isTTY bool
-
- // seekable is false if the host fd points to a file representing a stream,
- // e.g. a socket or a pipe. Such files are not seekable and can return
- // EWOULDBLOCK for I/O operations.
- //
- // This field is initialized at creation time and is immutable.
- seekable bool
-
- // wouldBlock is true if the host FD would return EWOULDBLOCK for
- // operations that would block.
- //
- // This field is initialized at creation time and is immutable.
- wouldBlock bool
-
- // Event queue for blocking operations.
- queue waiter.Queue
-
- // canMap specifies whether we allow the file to be memory mapped.
- //
- // This field is initialized at creation time and is immutable.
- canMap bool
-
- // mapsMu protects mappings.
- mapsMu sync.Mutex
-
- // If canMap is true, mappings tracks mappings of hostFD into
- // memmap.MappingSpaces.
- mappings memmap.MappingSet
-
- // pf implements platform.File for mappings of hostFD.
- pf inodePlatformFile
-}
-
-// CheckPermissions implements kernfs.Inode.
+// CheckPermissions implements kernfs.Inode.CheckPermissions.
func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
var s syscall.Stat_t
if err := syscall.Fstat(i.hostFD, &s); err != nil {
@@ -242,7 +279,7 @@ func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, a
return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid))
}
-// Mode implements kernfs.Inode.
+// Mode implements kernfs.Inode.Mode.
func (i *inode) Mode() linux.FileMode {
var s syscall.Stat_t
if err := syscall.Fstat(i.hostFD, &s); err != nil {
@@ -253,8 +290,8 @@ func (i *inode) Mode() linux.FileMode {
return linux.FileMode(s.Mode)
}
-// Stat implements kernfs.Inode.
-func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+// Stat implements kernfs.Inode.Stat.
+func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
if opts.Mask&linux.STATX__RESERVED != 0 {
return linux.Statx{}, syserror.EINVAL
}
@@ -366,9 +403,9 @@ func (i *inode) fstat(fs *filesystem) (linux.Statx, error) {
}, nil
}
-// SetStat implements kernfs.Inode.
+// SetStat implements kernfs.Inode.SetStat.
func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
- s := opts.Stat
+ s := &opts.Stat
m := s.Mask
if m == 0 {
@@ -381,7 +418,7 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
if err := syscall.Fstat(i.hostFD, &hostStat); err != nil {
return err
}
- if err := vfs.CheckSetStat(ctx, creds, &s, linux.FileMode(hostStat.Mode&linux.PermissionsMask), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil {
+ if err := vfs.CheckSetStat(ctx, creds, &opts, linux.FileMode(hostStat.Mode), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil {
return err
}
@@ -391,6 +428,9 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
}
}
if m&linux.STATX_SIZE != 0 {
+ if hostStat.Mode&linux.S_IFMT != linux.S_IFREG {
+ return syserror.EINVAL
+ }
if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil {
return err
}
@@ -422,31 +462,28 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
return nil
}
-// DecRef implements kernfs.Inode.
-func (i *inode) DecRef() {
- i.AtomicRefCount.DecRefWithDestructor(i.Destroy)
-}
-
-// Destroy implements kernfs.Inode.
-func (i *inode) Destroy() {
- if i.wouldBlock {
- fdnotifier.RemoveFD(int32(i.hostFD))
- }
- if err := unix.Close(i.hostFD); err != nil {
- log.Warningf("failed to close host fd %d: %v", i.hostFD, err)
- }
+// DecRef implements kernfs.Inode.DecRef.
+func (i *inode) DecRef(ctx context.Context) {
+ i.inodeRefs.DecRef(func() {
+ if i.mayBlock {
+ fdnotifier.RemoveFD(int32(i.hostFD))
+ }
+ if err := unix.Close(i.hostFD); err != nil {
+ log.Warningf("failed to close host fd %d: %v", i.hostFD, err)
+ }
+ })
}
-// Open implements kernfs.Inode.
-func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+// Open implements kernfs.Inode.Open.
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
// Once created, we cannot re-open a socket fd through /proc/[pid]/fd/.
if i.Mode().FileType() == linux.S_IFSOCK {
return nil, syserror.ENXIO
}
- return i.open(ctx, vfsd, rp.Mount(), opts.Flags)
+ return i.open(ctx, d, rp.Mount(), opts.Flags)
}
-func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, error) {
+func (i *inode) open(ctx context.Context, d *kernfs.Dentry, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, error) {
var s syscall.Stat_t
if err := syscall.Fstat(i.hostFD, &s); err != nil {
return nil, err
@@ -454,10 +491,12 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u
fileType := s.Mode & linux.FileTypeMask
// Constrain flags to a subset we can handle.
- // TODO(gvisor.dev/issue/1672): implement behavior corresponding to these allowed flags.
- flags &= syscall.O_ACCMODE | syscall.O_DIRECT | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND
+ //
+ // TODO(gvisor.dev/issue/2601): Support O_NONBLOCK by adding RWF_NOWAIT to pread/pwrite calls.
+ flags &= syscall.O_ACCMODE | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND
- if fileType == syscall.S_IFSOCK {
+ switch fileType {
+ case syscall.S_IFSOCK:
if i.isTTY {
log.Warningf("cannot use host socket fd %d as TTY", i.hostFD)
return nil, syserror.ENOTTY
@@ -468,35 +507,43 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u
return nil, err
}
// Currently, we only allow Unix sockets to be imported.
- return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d)
- }
+ return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d.VFSDentry(), &i.locks)
- // TODO(gvisor.dev/issue/1672): Whitelist specific file types here, so that
- // we don't allow importing arbitrary file types without proper support.
- if i.isTTY {
- fd := &TTYFileDescription{
- fileDescription: fileDescription{inode: i},
- termios: linux.DefaultSlaveTermios,
+ case syscall.S_IFREG, syscall.S_IFIFO, syscall.S_IFCHR:
+ if i.isTTY {
+ fd := &TTYFileDescription{
+ fileDescription: fileDescription{inode: i},
+ termios: linux.DefaultReplicaTermios,
+ }
+ fd.LockFD.Init(&i.locks)
+ vfsfd := &fd.vfsfd
+ if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
+ return nil, err
+ }
+ return vfsfd, nil
}
+
+ fd := &fileDescription{inode: i}
+ fd.LockFD.Init(&i.locks)
vfsfd := &fd.vfsfd
- if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
+ if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
return nil, err
}
return vfsfd, nil
- }
- fd := &fileDescription{inode: i}
- vfsfd := &fd.vfsfd
- if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
- return nil, err
+ default:
+ log.Warningf("cannot import host fd %d with file type %o", i.hostFD, fileType)
+ return nil, syserror.EPERM
}
- return vfsfd, nil
}
// fileDescription is embedded by host fd implementations of FileDescriptionImpl.
+//
+// +stateify savable
type fileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
+ vfs.LockFD
// inode is vfsfd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode), but
// cached to reduce indirections and casting. fileDescription does not hold
@@ -507,31 +554,43 @@ type fileDescription struct {
inode *inode
// offsetMu protects offset.
- offsetMu sync.Mutex
+ offsetMu sync.Mutex `state:"nosave"`
// offset specifies the current file offset. It is only meaningful when
// inode.seekable is true.
offset int64
}
-// SetStat implements vfs.FileDescriptionImpl.
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
creds := auth.CredentialsFromContext(ctx)
return f.inode.SetStat(ctx, f.vfsfd.Mount().Filesystem(), creds, opts)
}
-// Stat implements vfs.FileDescriptionImpl.
-func (f *fileDescription) Stat(_ context.Context, opts vfs.StatOptions) (linux.Statx, error) {
- return f.inode.Stat(f.vfsfd.Mount().Filesystem(), opts)
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+ return f.inode.Stat(ctx, f.vfsfd.Mount().Filesystem(), opts)
}
-// Release implements vfs.FileDescriptionImpl.
-func (f *fileDescription) Release() {
+// Release implements vfs.FileDescriptionImpl.Release.
+func (f *fileDescription) Release(context.Context) {
// noop
}
-// PRead implements FileDescriptionImpl.
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ return unix.Fallocate(f.inode.hostFD, uint32(mode), int64(offset), int64(length))
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ // Check that flags are supported.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+ if opts.Flags&^linux.RWF_HIPRI != 0 {
+ return 0, syserror.EOPNOTSUPP
+ }
+
i := f.inode
if !i.seekable {
return 0, syserror.ESPIPE
@@ -540,23 +599,35 @@ func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, off
return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags)
}
-// Read implements FileDescriptionImpl.
+// Read implements vfs.FileDescriptionImpl.Read.
func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ // Check that flags are supported.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+ if opts.Flags&^linux.RWF_HIPRI != 0 {
+ return 0, syserror.EOPNOTSUPP
+ }
+
i := f.inode
if !i.seekable {
+ bufN, err := i.readFromBuf(ctx, &dst)
+ if err != nil {
+ return bufN, err
+ }
n, err := readFromHostFD(ctx, i.hostFD, dst, -1, opts.Flags)
+ total := bufN + n
if isBlockError(err) {
// If we got any data at all, return it as a "completed" partial read
// rather than retrying until complete.
- if n != 0 {
+ if total != 0 {
err = nil
} else {
err = syserror.ErrWouldBlock
}
}
- return n, err
+ return total, err
}
- // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so.
+
f.offsetMu.Lock()
n, err := readFromHostFD(ctx, i.hostFD, dst, f.offset, opts.Flags)
f.offset += n
@@ -564,58 +635,90 @@ func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts
return n, err
}
-func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) {
- // TODO(gvisor.dev/issue/1672): Support select preadv2 flags.
- if flags != 0 {
- return 0, syserror.EOPNOTSUPP
+func (i *inode) readFromBuf(ctx context.Context, dst *usermem.IOSequence) (int64, error) {
+ if atomic.LoadUint32(&i.haveBuf) == 0 {
+ return 0, nil
}
+ i.bufMu.Lock()
+ defer i.bufMu.Unlock()
+ if len(i.buf) == 0 {
+ return 0, nil
+ }
+ n, err := dst.CopyOut(ctx, i.buf)
+ *dst = dst.DropFirst(n)
+ i.buf = i.buf[n:]
+ if len(i.buf) == 0 {
+ atomic.StoreUint32(&i.haveBuf, 0)
+ i.buf = nil
+ }
+ return int64(n), err
+}
+
+func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) {
reader := hostfd.GetReadWriterAt(int32(hostFD), offset, flags)
n, err := dst.CopyOutFrom(ctx, reader)
hostfd.PutReadWriterAt(reader)
return int64(n), err
}
-// PWrite implements FileDescriptionImpl.
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
- i := f.inode
- if !i.seekable {
+ if !f.inode.seekable {
return 0, syserror.ESPIPE
}
- return writeToHostFD(ctx, i.hostFD, src, offset, opts.Flags)
+ return f.writeToHostFD(ctx, src, offset, opts.Flags)
}
-// Write implements FileDescriptionImpl.
+// Write implements vfs.FileDescriptionImpl.Write.
func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
i := f.inode
if !i.seekable {
- n, err := writeToHostFD(ctx, i.hostFD, src, -1, opts.Flags)
+ n, err := f.writeToHostFD(ctx, src, -1, opts.Flags)
if isBlockError(err) {
err = syserror.ErrWouldBlock
}
return n, err
}
- // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so.
- // TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file.
+
f.offsetMu.Lock()
- n, err := writeToHostFD(ctx, i.hostFD, src, f.offset, opts.Flags)
+ // NOTE(gvisor.dev/issue/2983): O_APPEND may cause data corruption if
+ // another process modifies the host file between retrieving the file size
+ // and writing to the host fd. This is an unavoidable race condition because
+ // we cannot enforce synchronization on the host.
+ if f.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+ var s syscall.Stat_t
+ if err := syscall.Fstat(i.hostFD, &s); err != nil {
+ f.offsetMu.Unlock()
+ return 0, err
+ }
+ f.offset = s.Size
+ }
+ n, err := f.writeToHostFD(ctx, src, f.offset, opts.Flags)
f.offset += n
f.offsetMu.Unlock()
return n, err
}
-func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offset int64, flags uint32) (int64, error) {
- // TODO(gvisor.dev/issue/1672): Support select pwritev2 flags.
+func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSequence, offset int64, flags uint32) (int64, error) {
+ hostFD := f.inode.hostFD
+ // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
if flags != 0 {
return 0, syserror.EOPNOTSUPP
}
writer := hostfd.GetReadWriterAt(int32(hostFD), offset, flags)
n, err := src.CopyInTo(ctx, writer)
hostfd.PutReadWriterAt(writer)
+ // NOTE(gvisor.dev/issue/2979): We always sync everything, even for O_DSYNC.
+ if n > 0 && f.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 {
+ if syncErr := unix.Fsync(hostFD); syncErr != nil {
+ return int64(n), syncErr
+ }
+ }
return int64(n), err
}
-// Seek implements FileDescriptionImpl.
+// Seek implements vfs.FileDescriptionImpl.Seek.
//
// Note that we do not support seeking on directories, since we do not even
// allow directory fds to be imported at all.
@@ -680,15 +783,17 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i
return f.offset, nil
}
-// Sync implements FileDescriptionImpl.
-func (f *fileDescription) Sync(context.Context) error {
- // TODO(gvisor.dev/issue/1672): Currently we do not support the SyncData optimization, so we always sync everything.
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (f *fileDescription) Sync(ctx context.Context) error {
+ // TODO(gvisor.dev/issue/1897): Currently, we always sync everything.
return unix.Fsync(f.inode.hostFD)
}
-// ConfigureMMap implements FileDescriptionImpl.
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error {
- if !f.inode.canMap {
+ // NOTE(b/38213152): Technically, some obscure char devices can be memory
+ // mapped, but we only allow regular files.
+ if f.inode.ftype != syscall.S_IFREG {
return syserror.ENODEV
}
i := f.inode
@@ -699,16 +804,30 @@ func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts
// EventRegister implements waiter.Waitable.EventRegister.
func (f *fileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
f.inode.queue.EventRegister(e, mask)
- fdnotifier.UpdateFD(int32(f.inode.hostFD))
+ if f.inode.mayBlock {
+ fdnotifier.UpdateFD(int32(f.inode.hostFD))
+ }
}
// EventUnregister implements waiter.Waitable.EventUnregister.
func (f *fileDescription) EventUnregister(e *waiter.Entry) {
f.inode.queue.EventUnregister(e)
- fdnotifier.UpdateFD(int32(f.inode.hostFD))
+ if f.inode.mayBlock {
+ fdnotifier.UpdateFD(int32(f.inode.hostFD))
+ }
}
// Readiness uses the poll() syscall to check the status of the underlying FD.
func (f *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
return fdnotifier.NonBlockingPoll(int32(f.inode.hostFD), mask)
}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (f *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return f.Locks().LockPOSIX(ctx, &f.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (f *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return f.Locks().UnlockPOSIX(ctx, &f.vfsfd, uid, start, length, whence)
+}
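
The O_APPEND branch in Write has to learn the host file size and then write at that offset as two separate syscalls, which is the unavoidable race the NOTE above describes. Reduced to raw syscalls (appendWrite is illustrative):

	func appendWrite(fd int, buf []byte) (int, error) {
		var s syscall.Stat_t
		if err := syscall.Fstat(fd, &s); err != nil {
			return 0, err
		}
		// Another process may grow the file here; the write below would
		// then land at a stale offset and overwrite its data.
		return syscall.Pwrite(fd, buf, s.Size)
	}
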
diff --git a/pkg/sentry/fsimpl/host/mmap.go b/pkg/sentry/fsimpl/host/mmap.go
index 8545a82f0..3d7eb2f96 100644
--- a/pkg/sentry/fsimpl/host/mmap.go
+++ b/pkg/sentry/fsimpl/host/mmap.go
@@ -19,22 +19,23 @@ import (
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/usermem"
)
-// inodePlatformFile implements platform.File. It exists solely because inode
-// cannot implement both kernfs.Inode.IncRef and platform.File.IncRef.
+// inodePlatformFile implements memmap.File. It exists solely because inode
+// cannot implement both kernfs.Inode.IncRef and memmap.File.IncRef.
//
// inodePlatformFile should only be used if inode.ftype is syscall.S_IFREG,
// the only memory-mappable file type.
+//
+// +stateify savable
type inodePlatformFile struct {
*inode
// fdRefsMu protects fdRefs.
- fdRefsMu sync.Mutex
+ fdRefsMu sync.Mutex `state:"nosave"`
- // fdRefs counts references on platform.File offsets. It is used solely for
+ // fdRefs counts references on memmap.File offsets. It is used solely for
// memory accounting.
fdRefs fsutil.FrameRefSet
@@ -42,35 +43,35 @@ type inodePlatformFile struct {
fileMapper fsutil.HostFileMapper
// fileMapperInitOnce is used to lazily initialize fileMapper.
- fileMapperInitOnce sync.Once
+ fileMapperInitOnce sync.Once `state:"nosave"`
}
-// IncRef implements platform.File.IncRef.
+// IncRef implements memmap.File.IncRef.
//
// Precondition: i.inode.ftype must be syscall.S_IFREG.
-func (i *inodePlatformFile) IncRef(fr platform.FileRange) {
+func (i *inodePlatformFile) IncRef(fr memmap.FileRange) {
i.fdRefsMu.Lock()
i.fdRefs.IncRefAndAccount(fr)
i.fdRefsMu.Unlock()
}
-// DecRef implements platform.File.DecRef.
+// DecRef implements memmap.File.DecRef.
//
// Precondition: i.inode.ftype must be syscall.S_IFREG.
-func (i *inodePlatformFile) DecRef(fr platform.FileRange) {
+func (i *inodePlatformFile) DecRef(fr memmap.FileRange) {
i.fdRefsMu.Lock()
i.fdRefs.DecRefAndAccount(fr)
i.fdRefsMu.Unlock()
}
-// MapInternal implements platform.File.MapInternal.
+// MapInternal implements memmap.File.MapInternal.
//
// Precondition: i.inode.ftype must be syscall.S_IFREG.
-func (i *inodePlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+func (i *inodePlatformFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
return i.fileMapper.MapInternal(fr, i.hostFD, at.Write)
}
-// FD implements platform.File.FD.
+// FD implements memmap.File.FD.
func (i *inodePlatformFile) FD() int {
return i.hostFD
}
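
fileMapperInitOnce guards lazy initialization of fileMapper on the first mapping; a sketch of the idiom, assuming a hypothetical ensureMapper helper around the Init call referenced in save_restore.go below:

	func (i *inodePlatformFile) ensureMapper() {
		i.fileMapperInitOnce.Do(func() {
			i.fileMapper.Init() // runs at most once, even with racing first mappings
		})
	}
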
diff --git a/pkg/sentry/fsimpl/host/save_restore.go b/pkg/sentry/fsimpl/host/save_restore.go
new file mode 100644
index 000000000..7e32a8863
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/save_restore.go
@@ -0,0 +1,78 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "fmt"
+ "io"
+ "sync/atomic"
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/fdnotifier"
+ "gvisor.dev/gvisor/pkg/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/hostfd"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// beforeSave is invoked by stateify.
+func (i *inode) beforeSave() {
+ if !i.savable {
+ panic("host.inode is not savable")
+ }
+ if i.ftype == syscall.S_IFIFO {
+ // If this pipe FD is readable, drain it so that bytes in the pipe can
+ // be read after restore. (This is a legacy VFS1 feature.) We don't
+ // know if the pipe FD is readable, so just try reading and tolerate
+ // EBADF from the read.
+ i.bufMu.Lock()
+ defer i.bufMu.Unlock()
+ var buf [usermem.PageSize]byte
+ for {
+ n, err := hostfd.Preadv2(int32(i.hostFD), safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:])), -1 /* offset */, 0 /* flags */)
+ if n != 0 {
+ i.buf = append(i.buf, buf[:n]...)
+ }
+ if err != nil {
+ if err == io.EOF || err == syscall.EAGAIN || err == syscall.EBADF {
+ break
+ }
+ panic(fmt.Errorf("host.inode.beforeSave: buffering from pipe failed: %v", err))
+ }
+ }
+ if len(i.buf) != 0 {
+ atomic.StoreUint32(&i.haveBuf, 1)
+ }
+ }
+}
+
+// afterLoad is invoked by stateify.
+func (i *inode) afterLoad() {
+ if i.mayBlock {
+ if err := syscall.SetNonblock(i.hostFD, true); err != nil {
+ panic(fmt.Sprintf("host.inode.afterLoad: failed to set host FD %d non-blocking: %v", i.hostFD, err))
+ }
+ if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil {
+ panic(fmt.Sprintf("host.inode.afterLoad: fdnotifier.AddFD(%d) failed: %v", i.hostFD, err))
+ }
+ }
+}
+
+// afterLoad is invoked by stateify.
+func (i *inodePlatformFile) afterLoad() {
+ if i.fileMapper.IsInited() {
+ // Ensure that we don't call i.fileMapper.Init() again.
+ i.fileMapperInitOnce.Do(func() {})
+ }
+}
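
beforeSave drains readable pipe data into i.buf so it survives the checkpoint. The same drain loop over a non-blocking fd, reduced to the standard library (drainPipe is illustrative; error handling simplified):

	func drainPipe(fd int) []byte {
		var saved []byte
		tmp := make([]byte, 4096)
		for {
			n, err := syscall.Read(fd, tmp) // fd must be O_NONBLOCK
			if n > 0 {
				saved = append(saved, tmp[:n]...)
			}
			if err != nil || n == 0 {
				// EAGAIN (nothing buffered), EBADF (write-only fd), or EOF.
				return saved
			}
		}
	}
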
diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go
index 38f1fbfba..8a447e29f 100644
--- a/pkg/sentry/fsimpl/host/socket.go
+++ b/pkg/sentry/fsimpl/host/socket.go
@@ -22,7 +22,6 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fdnotifier"
"gvisor.dev/gvisor/pkg/log"
- "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/socket/control"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/uniqueid"
@@ -47,11 +46,6 @@ func newEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue) (transpor
return ep, nil
}
-// maxSendBufferSize is the maximum host send buffer size allowed for endpoint.
-//
-// N.B. 8MB is the default maximum on Linux (2 * sysctl_wmem_max).
-const maxSendBufferSize = 8 << 20
-
// ConnectedEndpoint is an implementation of transport.ConnectedEndpoint and
// transport.Receiver. It is backed by a host fd that was imported at sentry
// startup. This fd is shared with a hostfs inode, which retains ownership of
@@ -64,8 +58,7 @@ const maxSendBufferSize = 8 << 20
//
// +stateify savable
type ConnectedEndpoint struct {
- // ref keeps track of references to a ConnectedEndpoint.
- ref refs.AtomicRefCount
+ ConnectedEndpointRefs
// mu protects fd below.
mu sync.RWMutex `state:"nosave"`
@@ -114,10 +107,6 @@ func (c *ConnectedEndpoint) init() *syserr.Error {
if err != nil {
return syserr.FromError(err)
}
- if sndbuf > maxSendBufferSize {
- log.Warningf("Socket send buffer too large: %d", sndbuf)
- return syserr.ErrInvalidEndpointState
- }
c.stype = linux.SockType(stype)
c.sndbuf = int64(sndbuf)
@@ -141,14 +130,14 @@ func NewConnectedEndpoint(ctx context.Context, hostFD int, addr string, saveable
return nil, err
}
- // AtomicRefCounters start off with a single reference. We need two.
- e.ref.IncRef()
- e.ref.EnableLeakCheck("host.ConnectedEndpoint")
+ // ConnectedEndpointRefs start off with a single reference. We need two.
+ e.IncRef()
+ e.EnableLeakCheck()
return &e, nil
}
// Send implements transport.ConnectedEndpoint.Send.
-func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) {
+func (c *ConnectedEndpoint) Send(ctx context.Context, data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) {
c.mu.RLock()
defer c.mu.RUnlock()
@@ -225,7 +214,7 @@ func (c *ConnectedEndpoint) EventUpdate() {
}
// Recv implements transport.Receiver.Recv.
-func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
+func (c *ConnectedEndpoint) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
c.mu.RLock()
defer c.mu.RUnlock()
@@ -326,8 +315,8 @@ func (c *ConnectedEndpoint) destroyLocked() {
// Release implements transport.ConnectedEndpoint.Release and
// transport.Receiver.Release.
-func (c *ConnectedEndpoint) Release() {
- c.ref.DecRefWithDestructor(func() {
+func (c *ConnectedEndpoint) Release(ctx context.Context) {
+ c.DecRef(func() {
c.mu.Lock()
c.destroyLocked()
c.mu.Unlock()
@@ -356,13 +345,13 @@ func (e *SCMConnectedEndpoint) Init() error {
// Release implements transport.ConnectedEndpoint.Release and
// transport.Receiver.Release.
-func (e *SCMConnectedEndpoint) Release() {
- e.ref.DecRefWithDestructor(func() {
+func (e *SCMConnectedEndpoint) Release(ctx context.Context) {
+ e.DecRef(func() {
e.mu.Lock()
+ fdnotifier.RemoveFD(int32(e.fd))
if err := syscall.Close(e.fd); err != nil {
log.Warningf("Failed to close host fd %d: %v", err)
}
- fdnotifier.RemoveFD(int32(e.fd))
e.destroyLocked()
e.mu.Unlock()
})
@@ -387,8 +376,8 @@ func NewSCMEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue, addr s
return nil, err
}
- // AtomicRefCounters start off with a single reference. We need two.
- e.ref.IncRef()
- e.ref.EnableLeakCheck("host.SCMConnectedEndpoint")
+ // ConnectedEndpointRefs start off with a single reference. We need two.
+ e.IncRef()
+ e.EnableLeakCheck()
return &e, nil
}
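
Each endpoint above is handed out in two roles at once, transport.ConnectedEndpoint and transport.Receiver, and each role calls Release independently; since the generated ConnectedEndpointRefs start at one reference, the constructors take the second reference explicitly. A sketch of the pairing (endpoint and the cleanup body are illustrative):

	type endpoint struct {
		ConnectedEndpointRefs // generated; count starts at one
	}

	func newEndpoint() *endpoint {
		e := &endpoint{}
		e.IncRef()          // second ref: one per role handed to the caller
		e.EnableLeakCheck() // warn if the final reference is never dropped
		return e
	}

	// Release is called once per role; the destructor runs on the second call.
	func (e *endpoint) Release(ctx context.Context) {
		e.DecRef(func() { /* close the host fd, etc. */ })
	}
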
diff --git a/pkg/sentry/fsimpl/host/socket_iovec.go b/pkg/sentry/fsimpl/host/socket_iovec.go
index 584c247d2..fc0d5fd38 100644
--- a/pkg/sentry/fsimpl/host/socket_iovec.go
+++ b/pkg/sentry/fsimpl/host/socket_iovec.go
@@ -17,13 +17,10 @@ package host
import (
"syscall"
- "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/iovec"
"gvisor.dev/gvisor/pkg/syserror"
)
-// maxIovs is the maximum number of iovecs to pass to the host.
-var maxIovs = linux.UIO_MAXIOV
-
// copyToMulti copies as many bytes from src to dst as possible.
func copyToMulti(dst [][]byte, src []byte) {
for _, d := range dst {
@@ -74,7 +71,7 @@ func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovec
}
}
- if iovsRequired > maxIovs {
+ if iovsRequired > iovec.MaxIovs {
// The kernel will reject our call if we pass this many iovs.
// Use a single intermediate buffer instead.
b := make([]byte, stopLen)
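
The hunk above elides the body of copyToMulti; given its stated contract (copy as many bytes from src into the dst vector as possible), a plausible completion is:

	func copyToMulti(dst [][]byte, src []byte) {
		for _, d := range dst {
			n := copy(d, src)
			src = src[n:]
			if len(src) == 0 {
				break
			}
		}
	}
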
diff --git a/pkg/sentry/fsimpl/host/socket_unsafe.go b/pkg/sentry/fsimpl/host/socket_unsafe.go
index 35ded24bc..c0bf45f08 100644
--- a/pkg/sentry/fsimpl/host/socket_unsafe.go
+++ b/pkg/sentry/fsimpl/host/socket_unsafe.go
@@ -63,10 +63,10 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) (
controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC
if n > length {
- return length, n, msg.Controllen, controlTrunc, err
+ return length, n, msg.Controllen, controlTrunc, nil
}
- return n, n, msg.Controllen, controlTrunc, err
+ return n, n, msg.Controllen, controlTrunc, nil
}
// fdWriteVec sends from bufs to fd.
diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go
index 68af6e5af..f5c596fec 100644
--- a/pkg/sentry/fsimpl/host/tty.go
+++ b/pkg/sentry/fsimpl/host/tty.go
@@ -17,7 +17,9 @@ package host
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/unimpl"
"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -28,6 +30,8 @@ import (
// TTYFileDescription implements vfs.FileDescriptionImpl for a host file
// descriptor that wraps a TTY FD.
+//
+// +stateify savable
type TTYFileDescription struct {
fileDescription
@@ -66,15 +70,15 @@ func (t *TTYFileDescription) ForegroundProcessGroup() *kernel.ProcessGroup {
}
// Release implements fs.FileOperations.Release.
-func (t *TTYFileDescription) Release() {
+func (t *TTYFileDescription) Release(ctx context.Context) {
t.mu.Lock()
t.fgProcessGroup = nil
t.mu.Unlock()
- t.fileDescription.Release()
+ t.fileDescription.Release(ctx)
}
-// PRead implements vfs.FileDescriptionImpl.
+// PRead implements vfs.FileDescriptionImpl.PRead.
//
// Reading from a TTY is only allowed for foreground process groups. Background
// process groups will either get EIO or a SIGTTIN.
@@ -92,7 +96,7 @@ func (t *TTYFileDescription) PRead(ctx context.Context, dst usermem.IOSequence,
return t.fileDescription.PRead(ctx, dst, offset, opts)
}
-// Read implements vfs.FileDescriptionImpl.
+// Read implements vfs.FileDescriptionImpl.Read.
//
// Reading from a TTY is only allowed for foreground process groups. Background
// process groups will either get EIO or a SIGTTIN.
@@ -110,7 +114,7 @@ func (t *TTYFileDescription) Read(ctx context.Context, dst usermem.IOSequence, o
return t.fileDescription.Read(ctx, dst, opts)
}
-// PWrite implements vfs.FileDescriptionImpl.
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
t.mu.Lock()
defer t.mu.Unlock()
@@ -125,7 +129,7 @@ func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence,
return t.fileDescription.PWrite(ctx, src, offset, opts)
}
-// Write implements vfs.FileDescriptionImpl.
+// Write implements vfs.FileDescriptionImpl.Write.
func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
t.mu.Lock()
defer t.mu.Unlock()
@@ -140,8 +144,13 @@ func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence,
return t.fileDescription.Write(ctx, src, opts)
}
-// Ioctl implements vfs.FileDescriptionImpl.
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ task := kernel.TaskFromContext(ctx)
+ if task == nil {
+ return 0, syserror.ENOTTY
+ }
+
// Ignore arg[0]. This is the real FD:
fd := t.inode.hostFD
ioctl := args[1].Uint64()
@@ -151,9 +160,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
if err != nil {
return 0, err
}
- _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err = termios.CopyOut(task, args[2].Pointer())
return 0, err
case linux.TCSETS, linux.TCSETSW, linux.TCSETSF:
@@ -165,9 +172,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
}
var termios linux.Termios
- if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ if _, err := termios.CopyIn(task, args[2].Pointer()); err != nil {
return 0, err
}
err := ioctlSetTermios(fd, ioctl, &termios)
@@ -191,10 +196,8 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
defer t.mu.Unlock()
// Map the ProcessGroup into a ProcessGroupID in the task's PID namespace.
- pgID := pidns.IDOfProcessGroup(t.fgProcessGroup)
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ pgID := primitive.Int32(pidns.IDOfProcessGroup(t.fgProcessGroup))
+ _, err := pgID.CopyOut(task, args[2].Pointer())
return 0, err
case linux.TIOCSPGRP:
@@ -202,11 +205,6 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
// Equivalent to tcsetpgrp(fd, *argp).
// Set the foreground process group ID of this terminal.
- task := kernel.TaskFromContext(ctx)
- if task == nil {
- return 0, syserror.ENOTTY
- }
-
t.mu.Lock()
defer t.mu.Unlock()
@@ -225,12 +223,11 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
return 0, syserror.ENOTTY
}
- var pgID kernel.ProcessGroupID
- if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ var pgIDP primitive.Int32
+ if _, err := pgIDP.CopyIn(task, args[2].Pointer()); err != nil {
return 0, err
}
+ pgID := kernel.ProcessGroupID(pgIDP)
// pgID must be non-negative.
if pgID < 0 {
@@ -259,9 +256,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
if err != nil {
return 0, err
}
- _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err = winsize.CopyOut(task, args[2].Pointer())
return 0, err
case linux.TIOCSWINSZ:
@@ -272,9 +267,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
// set the winsize.
var winsize linux.Winsize
- if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ if _, err := winsize.CopyIn(task, args[2].Pointer()); err != nil {
return 0, err
}
err := ioctlSetWinsize(fd, &winsize)
@@ -325,9 +318,9 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal)
task := kernel.TaskFromContext(ctx)
if task == nil {
// No task? Linux does not have an analog for this case, but
- // tty_check_change is more of a blacklist of cases than a
- // whitelist, and is surprisingly permissive. Allowing the
- // change seems most appropriate.
+ // tty_check_change only blocks specific cases and is
+ // surprisingly permissive. Allowing the change seems
+ // appropriate.
return nil
}
@@ -375,5 +368,15 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal)
//
// Linux ignores the result of kill_pgrp().
_ = pg.SendSignal(kernel.SignalInfoPriv(sig))
- return kernel.ERESTARTSYS
+ return syserror.ERESTARTSYS
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (t *TTYFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, typ fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return t.Locks().LockPOSIX(ctx, &t.vfsfd, uid, typ, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (t *TTYFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return t.Locks().UnlockPOSIX(ctx, &t.vfsfd, uid, start, length, whence)
}
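
The ioctl handlers above replace usermem.CopyObjectIn/CopyObjectOut with typed marshalling from pkg/marshal/primitive, using the Task as the copy context (hence the early ENOTTY when no task is attached). The general shape, for any fixed-size scalar (addr and result are illustrative):

	// Copy a 32-bit value in from user memory...
	var in primitive.Int32
	if _, err := in.CopyIn(task, addr); err != nil {
		return 0, err
	}
	// ...and copy a result back out.
	out := primitive.Int32(result)
	_, err := out.CopyOut(task, addr)
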
diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go
index 2bc757b1a..b2f43a119 100644
--- a/pkg/sentry/fsimpl/host/util.go
+++ b/pkg/sentry/fsimpl/host/util.go
@@ -43,22 +43,6 @@ func timespecToStatxTimestamp(ts unix.Timespec) linux.StatxTimestamp {
return linux.StatxTimestamp{Sec: int64(ts.Sec), Nsec: uint32(ts.Nsec)}
}
-// wouldBlock returns true for file types that can return EWOULDBLOCK
-// for blocking operations, e.g. pipes, character devices, and sockets.
-func wouldBlock(fileType uint32) bool {
- return fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK
-}
-
-// canMap returns true if a file with fileType is allowed to be memory mapped.
-// This is ported over from VFS1, but it's probably not the best way for us
-// to check if a file can be memory mapped.
-func canMap(fileType uint32) bool {
- // TODO(gvisor.dev/issue/1672): Also allow "special files" to be mapped (see fs/host:canMap()).
- //
- // TODO(b/38213152): Some obscure character devices can be mapped.
- return fileType == syscall.S_IFREG
-}
-
// isBlockError checks if an error is EAGAIN or EWOULDBLOCK.
// If so, they can be transformed into syserror.ErrWouldBlock.
func isBlockError(err error) bool {
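
For reference, the body of isBlockError is cut off by the hunk above; per its doc comment, it folds the two kernel spellings of "would block" together:

	func isBlockError(err error) bool {
		return err == syscall.EAGAIN || err == syscall.EWOULDBLOCK
	}
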
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index ef34cb28a..aaad67ab8 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -4,6 +4,18 @@ load("//tools/go_generics:defs.bzl", "go_template_instance")
licenses(["notice"])
go_template_instance(
+ name = "dentry_list",
+ out = "dentry_list.go",
+ package = "kernfs",
+ prefix = "dentry",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*Dentry",
+ "Linker": "*Dentry",
+ },
+)
+
+go_template_instance(
name = "fstree",
out = "fstree.go",
package = "kernfs",
@@ -26,9 +38,54 @@ go_template_instance(
},
)
+go_template_instance(
+ name = "static_directory_refs",
+ out = "static_directory_refs.go",
+ package = "kernfs",
+ prefix = "StaticDirectory",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "StaticDirectory",
+ },
+)
+
+go_template_instance(
+ name = "dir_refs",
+ out = "dir_refs.go",
+ package = "kernfs_test",
+ prefix = "dir",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "dir",
+ },
+)
+
+go_template_instance(
+ name = "readonly_dir_refs",
+ out = "readonly_dir_refs.go",
+ package = "kernfs_test",
+ prefix = "readonlyDir",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "readonlyDir",
+ },
+)
+
+go_template_instance(
+ name = "synthetic_directory_refs",
+ out = "synthetic_directory_refs.go",
+ package = "kernfs",
+ prefix = "syntheticDirectory",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "syntheticDirectory",
+ },
+)
+
go_library(
name = "kernfs",
srcs = [
+ "dentry_list.go",
"dynamic_bytes_file.go",
"fd_impl_util.go",
"filesystem.go",
@@ -36,7 +93,10 @@ go_library(
"inode_impl_util.go",
"kernfs.go",
"slot_list.go",
+ "static_directory_refs.go",
"symlink.go",
+ "synthetic_directory.go",
+ "synthetic_directory_refs.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
@@ -45,7 +105,11 @@ go_library(
"//pkg/fspath",
"//pkg/log",
"//pkg/refs",
+ "//pkg/refsvfs2",
+ "//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/time",
"//pkg/sentry/memmap",
"//pkg/sentry/socket/unix/transport",
"//pkg/sentry/vfs",
@@ -58,17 +122,24 @@ go_library(
go_test(
name = "kernfs_test",
size = "small",
- srcs = ["kernfs_test.go"],
+ srcs = [
+ "dir_refs.go",
+ "kernfs_test.go",
+ "readonly_dir_refs.go",
+ ],
deps = [
":kernfs",
"//pkg/abi/linux",
"//pkg/context",
+ "//pkg/log",
+ "//pkg/refs",
+ "//pkg/refsvfs2",
"//pkg/sentry/contexttest",
"//pkg/sentry/fsimpl/testutil",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/vfs",
"//pkg/syserror",
"//pkg/usermem",
- "@com_github_google_go-cmp//cmp:go_default_library",
+ "@com_github_google_go_cmp//cmp:go_default_library",
],
)
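
Background on the new go_template_instance rules: //pkg/ilist:generic_list generates an intrusive doubly linked list over *Dentry (dentry_list.go), and //pkg/refsvfs2:refs_template generates per-type atomic reference counters (e.g. StaticDirectoryRefs) whose DecRef takes a destroy callback. The counter contract, restated as a self-contained toy — the generated code additionally integrates with the refsvfs2 leak checker and biases the stored count, which this sketch omits:

	package main

	import (
		"fmt"
		"sync/atomic"
	)

	// toyRefs mimics the contract of a refs_template instance: IncRef and
	// DecRef are lock-free, and the destroy callback runs exactly once,
	// when the final reference is dropped.
	type toyRefs struct {
		refCount int64
	}

	func newToyRefs() *toyRefs { return &toyRefs{refCount: 1} }

	func (r *toyRefs) IncRef() { atomic.AddInt64(&r.refCount, 1) }

	// TryIncRef fails once the count has reached zero; a destroyed object
	// can never be revived.
	func (r *toyRefs) TryIncRef() bool {
		for {
			v := atomic.LoadInt64(&r.refCount)
			if v == 0 {
				return false
			}
			if atomic.CompareAndSwapInt64(&r.refCount, v, v+1) {
				return true
			}
		}
	}

	func (r *toyRefs) DecRef(destroy func()) {
		if atomic.AddInt64(&r.refCount, -1) == 0 && destroy != nil {
			destroy()
		}
	}

	func main() {
		r := newToyRefs() // one reference held by the creator
		r.IncRef()
		r.DecRef(nil)
		r.DecRef(func() { fmt.Println("destroyed") }) // prints "destroyed"
	}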
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index 1568a9d49..485504995 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -19,6 +19,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
@@ -34,28 +35,30 @@ import (
// +stateify savable
type DynamicBytesFile struct {
InodeAttrs
+ InodeNoStatFS
InodeNoopRefCount
InodeNotDirectory
InodeNotSymlink
- data vfs.DynamicBytesSource
+ locks vfs.FileLocks
+ data vfs.DynamicBytesSource
}
var _ Inode = (*DynamicBytesFile)(nil)
// Init initializes a dynamic bytes file.
-func (f *DynamicBytesFile) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) {
+func (f *DynamicBytesFile) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) {
if perm&^linux.PermissionsMask != 0 {
panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
}
- f.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
+ f.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
f.data = data
}
// Open implements Inode.Open.
-func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd := &DynamicBytesFD{}
- if err := fd.Init(rp.Mount(), vfsd, f.data, opts.Flags); err != nil {
+ if err := fd.Init(rp.Mount(), d, f.data, &f.locks, opts.Flags); err != nil {
return nil, err
}
return &fd.vfsfd, nil
@@ -77,17 +80,19 @@ func (*DynamicBytesFile) SetStat(context.Context, *vfs.Filesystem, *auth.Credent
type DynamicBytesFD struct {
vfs.FileDescriptionDefaultImpl
vfs.DynamicBytesFileDescriptionImpl
+ vfs.LockFD
vfsfd vfs.FileDescription
inode Inode
}
// Init initializes a DynamicBytesFD.
-func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) error {
- if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
+func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error {
+ fd.LockFD.Init(locks)
+ if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
return err
}
- fd.inode = d.Impl().(*Dentry).inode
+ fd.inode = d.inode
fd.SetDataSource(data)
return nil
}
@@ -97,12 +102,12 @@ func (fd *DynamicBytesFD) Seek(ctx context.Context, offset int64, whence int32)
return fd.DynamicBytesFileDescriptionImpl.Seek(ctx, offset, whence)
}
-// Read implmenets vfs.FileDescriptionImpl.Read.
+// Read implements vfs.FileDescriptionImpl.Read.
func (fd *DynamicBytesFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
return fd.DynamicBytesFileDescriptionImpl.Read(ctx, dst, opts)
}
-// PRead implmenets vfs.FileDescriptionImpl.PRead.
+// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *DynamicBytesFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
return fd.DynamicBytesFileDescriptionImpl.PRead(ctx, dst, offset, opts)
}
@@ -118,12 +123,12 @@ func (fd *DynamicBytesFD) PWrite(ctx context.Context, src usermem.IOSequence, of
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *DynamicBytesFD) Release() {}
+func (fd *DynamicBytesFD) Release(context.Context) {}
// Stat implements vfs.FileDescriptionImpl.Stat.
func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
fs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
- return fd.inode.Stat(fs, opts)
+ return fd.inode.Stat(ctx, fs, opts)
}
// SetStat implements vfs.FileDescriptionImpl.SetStat.
@@ -131,3 +136,13 @@ func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error {
// DynamicBytesFiles are immutable.
return syserror.EPERM
}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *DynamicBytesFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *DynamicBytesFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
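
Seen from a caller, the new signatures look like this: Init gained a context so InodeAttrs can stamp creation timestamps, and Open now receives the kernfs *Dentry directly. A hedged sketch of a read-only file backed by a vfs.DynamicBytesSource (the "version" file and helper are illustrative, not part of this change):

	// versionData regenerates its contents on each open; nothing is cached.
	type versionData struct{}

	// Generate implements vfs.DynamicBytesSource.Generate.
	func (versionData) Generate(ctx context.Context, buf *bytes.Buffer) error {
		_, err := buf.WriteString("Linux version 4.4.0\n")
		return err
	}

	// newVersionFile is a hypothetical constructor showing the new Init.
	func newVersionFile(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64) *kernfs.DynamicBytesFile {
		f := &kernfs.DynamicBytesFile{}
		f.Init(ctx, creds, devMajor, devMinor, ino, versionData{}, 0444)
		return f
	}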
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index 8284e76a7..f8dae22f8 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -15,10 +15,11 @@
package kernfs
import (
- "math"
+ "fmt"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -27,9 +28,29 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
+// SeekEndConfig describes the SEEK_END behaviour for FDs.
+//
+// +stateify savable
+type SeekEndConfig int
+
+// Constants related to SEEK_END behaviour for FDs.
+const (
+ // Consider the end of the file to be after the final static entry. This is
+ // the default option.
+ SeekEndStaticEntries = iota
+ // Consider the end of the file to be at offset 0.
+ SeekEndZero
+)
+
+// GenericDirectoryFDOptions contains configuration for a GenericDirectoryFD.
+//
+// +stateify savable
+type GenericDirectoryFDOptions struct {
+ SeekEnd SeekEndConfig
+}
+
// GenericDirectoryFD implements vfs.FileDescriptionImpl for a generic directory
-// inode that uses OrderChildren to track child nodes. GenericDirectoryFD is not
-// compatible with dynamic directories.
+// inode that uses OrderChildren to track child nodes.
//
// Note that GenericDirectoryFD holds a lock over OrderedChildren while calling
// IterDirents callback. The IterDirents callback therefore cannot hash or
@@ -39,15 +60,21 @@ import (
// Must be initialized with Init before first use.
//
// Lock ordering: mu => children.mu.
+//
+// +stateify savable
type GenericDirectoryFD struct {
vfs.FileDescriptionDefaultImpl
vfs.DirectoryFileDescriptionDefaultImpl
+ vfs.LockFD
+
+ // Immutable.
+ seekEnd SeekEndConfig
vfsfd vfs.FileDescription
children *OrderedChildren
// mu protects the fields below.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
// off is the current directory offset. Protected by "mu".
off int64
@@ -55,12 +82,12 @@ type GenericDirectoryFD struct {
// NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its
// dentry.
-func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) {
+func NewGenericDirectoryFD(m *vfs.Mount, d *Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) (*GenericDirectoryFD, error) {
fd := &GenericDirectoryFD{}
- if err := fd.Init(children, opts); err != nil {
+ if err := fd.Init(children, locks, opts, fdOpts); err != nil {
return nil, err
}
- if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
+ if err := fd.vfsfd.Init(fd, opts.Flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
return nil, err
}
return fd, nil
@@ -69,11 +96,13 @@ func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildre
// Init initializes a GenericDirectoryFD. Use it when overriding
// GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the
// correct implementation.
-func (fd *GenericDirectoryFD) Init(children *OrderedChildren, opts *vfs.OpenOptions) error {
+func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) error {
if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 {
// Can't open directories for writing.
return syserror.EISDIR
}
+ fd.LockFD.Init(locks)
+ fd.seekEnd = fdOpts.SeekEnd
fd.children = children
return nil
}
@@ -109,18 +138,22 @@ func (fd *GenericDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence
return fd.DirectoryFileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts)
}
-// Release implements vfs.FileDecriptionImpl.Release.
-func (fd *GenericDirectoryFD) Release() {}
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *GenericDirectoryFD) Release(context.Context) {}
func (fd *GenericDirectoryFD) filesystem() *vfs.Filesystem {
return fd.vfsfd.VirtualDentry().Mount().Filesystem()
}
+func (fd *GenericDirectoryFD) dentry() *Dentry {
+ return fd.vfsfd.Dentry().Impl().(*Dentry)
+}
+
func (fd *GenericDirectoryFD) inode() Inode {
- return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
+ return fd.dentry().inode
}
-// IterDirents implements vfs.FileDecriptionImpl.IterDirents. IterDirents holds
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents. IterDirents holds
// o.mu when calling cb.
func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
fd.mu.Lock()
@@ -129,7 +162,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
opts := vfs.StatOptions{Mask: linux.STATX_INO}
// Handle ".".
if fd.off == 0 {
- stat, err := fd.inode().Stat(fd.filesystem(), opts)
+ stat, err := fd.inode().Stat(ctx, fd.filesystem(), opts)
if err != nil {
return err
}
@@ -147,9 +180,8 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
// Handle "..".
if fd.off == 1 {
- vfsd := fd.vfsfd.VirtualDentry().Dentry()
- parentInode := genericParentOrSelf(vfsd.Impl().(*Dentry)).inode
- stat, err := parentInode.Stat(fd.filesystem(), opts)
+ parentInode := genericParentOrSelf(fd.dentry()).inode
+ stat, err := parentInode.Stat(ctx, fd.filesystem(), opts)
if err != nil {
return err
}
@@ -172,13 +204,12 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
// these.
childIdx := fd.off - 2
for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() {
- inode := it.Dentry.Impl().(*Dentry).inode
- stat, err := inode.Stat(fd.filesystem(), opts)
+ stat, err := it.inode.Stat(ctx, fd.filesystem(), opts)
if err != nil {
return err
}
dirent := vfs.Dirent{
- Name: it.Name,
+ Name: it.name,
Type: linux.FileMode(stat.Mode).DirentType(),
Ino: stat.Ino,
NextOff: fd.off + 1,
@@ -191,11 +222,11 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
var err error
relOffset := fd.off - int64(len(fd.children.set)) - 2
- fd.off, err = fd.inode().IterDirents(ctx, cb, fd.off, relOffset)
+ fd.off, err = fd.inode().IterDirents(ctx, fd.vfsfd.Mount(), cb, fd.off, relOffset)
return err
}
-// Seek implements vfs.FileDecriptionImpl.Seek.
+// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
fd.mu.Lock()
defer fd.mu.Unlock()
@@ -206,9 +237,17 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int
case linux.SEEK_CUR:
offset += fd.off
case linux.SEEK_END:
- // TODO(gvisor.dev/issue/1193): This can prevent new files from showing up
- // if they are added after SEEK_END.
- offset = math.MaxInt64
+ switch fd.seekEnd {
+ case SeekEndStaticEntries:
+ fd.children.mu.RLock()
+ offset += int64(len(fd.children.set))
+ offset += 2 // '.' and '..' aren't tracked in children.
+ fd.children.mu.RUnlock()
+ case SeekEndZero:
+ // No-op: offset += 0.
+ default:
+ panic(fmt.Sprintf("Invalid GenericDirectoryFD.seekEnd = %v", fd.seekEnd))
+ }
default:
return 0, syserror.EINVAL
}
@@ -223,12 +262,26 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int
func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
fs := fd.filesystem()
inode := fd.inode()
- return inode.Stat(fs, opts)
+ return inode.Stat(ctx, fs, opts)
}
// SetStat implements vfs.FileDescriptionImpl.SetStat.
func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
creds := auth.CredentialsFromContext(ctx)
- inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
- return inode.SetStat(ctx, fd.filesystem(), creds, opts)
+ return fd.inode().SetStat(ctx, fd.filesystem(), creds, opts)
+}
+
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *GenericDirectoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ return fd.DirectoryFileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *GenericDirectoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *GenericDirectoryFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
}
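
Worked example of the new SEEK_END rule: with SeekEndStaticEntries, a directory holding 3 static children ends at offset 3 + 2 = 5 ('.' and '..' are counted but never stored in children), so seeking to -1 relative to SEEK_END lands on the last child at offset 4; SeekEndZero instead pins SEEK_END at 0, which suits directories whose contents are entirely dynamic. A sketch of threading the new options through (wrapper hypothetical, signature taken from this diff):

	// openStaticDir opens a kernfs directory with the default SEEK_END
	// behavior; pass kernfs.SeekEndZero for fully dynamic directories.
	func openStaticDir(m *vfs.Mount, d *kernfs.Dentry, children *kernfs.OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) (*kernfs.GenericDirectoryFD, error) {
		return kernfs.NewGenericDirectoryFD(m, d, children, locks, opts,
			kernfs.GenericDirectoryFDOptions{SeekEnd: kernfs.SeekEndStaticEntries})
	}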
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 4a12ae245..399895f3e 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -32,11 +32,12 @@ import (
//
// stepExistingLocked is loosely analogous to fs/namei.c:walk_component().
//
-// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
+// Preconditions:
+// * Filesystem.mu must be locked for at least reading.
+// * !rp.Done().
//
// Postcondition: Caller must call fs.processDeferredDecRefs*.
-func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) (*vfs.Dentry, error) {
- d := vfsd.Impl().(*Dentry)
+func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, mayFollowSymlinks bool) (*Dentry, error) {
if !d.isDir() {
return nil, syserror.ENOTDIR
}
@@ -53,20 +54,20 @@ afterSymlink:
// calls d_revalidate(), but walk_component() => handle_dots() does not.
if name == "." {
rp.Advance()
- return vfsd, nil
+ return d, nil
}
if name == ".." {
- if isRoot, err := rp.CheckRoot(vfsd); err != nil {
+ if isRoot, err := rp.CheckRoot(ctx, d.VFSDentry()); err != nil {
return nil, err
} else if isRoot || d.parent == nil {
rp.Advance()
- return vfsd, nil
+ return d, nil
}
- if err := rp.CheckMount(&d.parent.vfsd); err != nil {
+ if err := rp.CheckMount(ctx, d.parent.VFSDentry()); err != nil {
return nil, err
}
rp.Advance()
- return &d.parent.vfsd, nil
+ return d.parent, nil
}
if len(name) > linux.NAME_MAX {
return nil, syserror.ENAMETOOLONG
@@ -77,18 +78,18 @@ afterSymlink:
if err != nil {
return nil, err
}
- if err := rp.CheckMount(&next.vfsd); err != nil {
+ if err := rp.CheckMount(ctx, next.VFSDentry()); err != nil {
return nil, err
}
// Resolve any symlink at current path component.
- if rp.ShouldFollowSymlink() && next.isSymlink() {
+ if mayFollowSymlinks && rp.ShouldFollowSymlink() && next.isSymlink() {
targetVD, targetPathname, err := next.inode.Getlink(ctx, rp.Mount())
if err != nil {
return nil, err
}
if targetVD.Ok() {
err := rp.HandleJump(targetVD)
- targetVD.DecRef()
+ fs.deferDecRefVD(ctx, targetVD)
if err != nil {
return nil, err
}
@@ -100,15 +101,18 @@ afterSymlink:
goto afterSymlink
}
rp.Advance()
- return &next.vfsd, nil
+ return next, nil
}
// revalidateChildLocked must be called after a call to parent.vfsd.Child(name)
// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be
// nil) to verify that the returned child (or lack thereof) is correct.
//
-// Preconditions: Filesystem.mu must be locked for at least reading.
-// parent.dirMu must be locked. parent.isDir(). name is not "." or "..".
+// Preconditions:
+// * Filesystem.mu must be locked for at least reading.
+// * parent.dirMu must be locked.
+// * parent.isDir().
+// * name is not "." or "..".
//
// Postconditions: Caller must call fs.processDeferredDecRefs*.
func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, child *Dentry) (*Dentry, error) {
@@ -116,26 +120,33 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
// Cached dentry exists, revalidate.
if !child.inode.Valid(ctx) {
delete(parent.children, name)
- vfsObj.InvalidateDentry(&child.vfsd)
- fs.deferDecRef(&child.vfsd) // Reference from Lookup.
+ if child.inode.Keep() {
+ // Drop the ref owned by kernfs.
+ fs.deferDecRef(child)
+ }
+ vfsObj.InvalidateDentry(ctx, child.VFSDentry())
child = nil
}
}
if child == nil {
- // Dentry isn't cached; it either doesn't exist or failed
- // revalidation. Attempt to resolve it via Lookup.
- //
- // FIXME(gvisor.dev/issue/1193): Inode.Lookup() should return
- // *(kernfs.)Dentry, not *vfs.Dentry, since (kernfs.)Filesystem assumes
- // that all dentries in the filesystem are (kernfs.)Dentry and performs
- // vfs.DentryImpl casts accordingly.
- childVFSD, err := parent.inode.Lookup(ctx, name)
+ // Dentry isn't cached; it either doesn't exist or failed revalidation.
+ // Attempt to resolve it via Lookup.
+ childInode, err := parent.inode.Lookup(ctx, name)
if err != nil {
return nil, err
}
- // Reference on childVFSD dropped by a corresponding Valid.
- child = childVFSD.Impl().(*Dentry)
- parent.insertChildLocked(name, child)
+ var newChild Dentry
+ newChild.Init(fs, childInode) // childInode's ref is transferred to newChild.
+ parent.insertChildLocked(name, &newChild)
+ child = &newChild
+
+ // Drop the ref on newChild. This will cause the dentry to get pruned
+ // from the dentry tree by the end of the current filesystem operation
+ // (before returning to the VFS layer) unless another ref is taken on
+ // this dentry.
+ if !childInode.Keep() {
+ fs.deferDecRef(&newChild)
+ }
}
return child, nil
}
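
The Keep branch above is the heart of the new lifecycle: an inode whose Keep() returns true keeps kernfs's own reference on the dentry alive (it stays cached); otherwise the ref is deferDecRef'd and the dentry is pruned once the operation's locks drop, unless something else took a reference. A hedged sketch of an inode opting out of caching (mixins abbreviated; Keep() bool is the interface method this diff relies on):

	// ephemeralInode is hypothetical: its dentries should not outlive the
	// filesystem operation that looked them up.
	type ephemeralInode struct {
		kernfs.InodeAttrs
		kernfs.InodeNoopRefCount
		kernfs.InodeNotDirectory
		kernfs.InodeNotSymlink
	}

	// Keep implements kernfs.Inode.Keep.
	func (*ephemeralInode) Keep() bool {
		return false
	}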
@@ -148,20 +159,19 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
// Preconditions: Filesystem.mu must be locked for at least reading.
//
// Postconditions: Caller must call fs.processDeferredDecRefs*.
-func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) {
- vfsd := rp.Start()
+func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*Dentry, error) {
+ d := rp.Start().Impl().(*Dentry)
for !rp.Done() {
var err error
- vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd)
+ d, err = fs.stepExistingLocked(ctx, rp, d, true /* mayFollowSymlinks */)
if err != nil {
- return nil, nil, err
+ return nil, err
}
}
- d := vfsd.Impl().(*Dentry)
if rp.MustBeDir() && !d.isDir() {
- return nil, nil, syserror.ENOTDIR
+ return nil, syserror.ENOTDIR
}
- return vfsd, d.inode, nil
+ return d, nil
}
// walkParentDirLocked resolves all but the last path component of rp to an
@@ -171,32 +181,34 @@ func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingP
// walkParentDirLocked is loosely analogous to Linux's
// fs/namei.c:path_parentat().
//
-// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
+// Preconditions:
+// * Filesystem.mu must be locked for at least reading.
+// * !rp.Done().
//
// Postconditions: Caller must call fs.processDeferredDecRefs*.
-func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) {
- vfsd := rp.Start()
+func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*Dentry, error) {
+ d := rp.Start().Impl().(*Dentry)
for !rp.Final() {
var err error
- vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd)
+ d, err = fs.stepExistingLocked(ctx, rp, d, true /* mayFollowSymlinks */)
if err != nil {
- return nil, nil, err
+ return nil, err
}
}
- d := vfsd.Impl().(*Dentry)
if !d.isDir() {
- return nil, nil, syserror.ENOTDIR
+ return nil, syserror.ENOTDIR
}
- return vfsd, d.inode, nil
+ return d, nil
}
// checkCreateLocked checks that a file named rp.Component() may be created in
-// directory parentVFSD, then returns rp.Component().
+// directory parent, then returns rp.Component().
//
-// Preconditions: Filesystem.mu must be locked for at least reading. parentInode
-// == parentVFSD.Impl().(*Dentry).Inode. isDir(parentInode) == true.
-func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) {
- if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+// Preconditions:
+// * Filesystem.mu must be locked for at least reading.
+// * isDir(parentInode) == true.
+func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *Dentry) (string, error) {
+ if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
return "", err
}
pc := rp.Component()
@@ -206,11 +218,10 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v
if len(pc) > linux.NAME_MAX {
return "", syserror.ENAMETOOLONG
}
- // FIXME(gvisor.dev/issue/1193): Data race due to not holding dirMu.
- if _, ok := parentVFSD.Impl().(*Dentry).children[pc]; ok {
+ if _, ok := parent.children[pc]; ok {
return "", syserror.EEXIST
}
- if parentVFSD.IsDead() {
+ if parent.VFSDentry().IsDead() {
return "", syserror.ENOENT
}
return pc, nil
@@ -219,8 +230,8 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v
// checkDeleteLocked checks that the file represented by vfsd may be deleted.
//
// Preconditions: Filesystem.mu must be locked for at least reading.
-func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error {
- parent := vfsd.Impl().(*Dentry).parent
+func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) error {
+ parent := d.parent
if parent == nil {
return syserror.EBUSY
}
@@ -234,7 +245,7 @@ func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Den
}
// Release implements vfs.FilesystemImpl.Release.
-func (fs *Filesystem) Release() {
+func (fs *Filesystem) Release(context.Context) {
}
// Sync implements vfs.FilesystemImpl.Sync.
@@ -246,35 +257,35 @@ func (fs *Filesystem) Sync(ctx context.Context) error {
// AccessAt implements vfs.Filesystem.Impl.AccessAt.
func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
fs.mu.RLock()
- defer fs.processDeferredDecRefs()
+ defer fs.processDeferredDecRefs(ctx)
defer fs.mu.RUnlock()
- _, inode, err := fs.walkExistingLocked(ctx, rp)
+ d, err := fs.walkExistingLocked(ctx, rp)
if err != nil {
return err
}
- return inode.CheckPermissions(ctx, creds, ats)
+ return d.inode.CheckPermissions(ctx, creds, ats)
}
// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
fs.mu.RLock()
- defer fs.processDeferredDecRefs()
+ defer fs.processDeferredDecRefs(ctx)
defer fs.mu.RUnlock()
- vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
+ d, err := fs.walkExistingLocked(ctx, rp)
if err != nil {
return nil, err
}
if opts.CheckSearchable {
- d := vfsd.Impl().(*Dentry)
if !d.isDir() {
return nil, syserror.ENOTDIR
}
- if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
+ if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
return nil, err
}
}
+ vfsd := d.VFSDentry()
vfsd.IncRef() // Ownership transferred to caller.
return vfsd, nil
}
@@ -282,14 +293,14 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
fs.mu.RLock()
- defer fs.processDeferredDecRefs()
+ defer fs.processDeferredDecRefs(ctx)
defer fs.mu.RUnlock()
- vfsd, _, err := fs.walkParentDirLocked(ctx, rp)
+ d, err := fs.walkParentDirLocked(ctx, rp)
if err != nil {
return nil, err
}
- vfsd.IncRef() // Ownership transferred to caller.
- return vfsd, nil
+ d.IncRef() // Ownership transferred to caller.
+ return d.VFSDentry(), nil
}
// LinkAt implements vfs.FilesystemImpl.LinkAt.
@@ -298,13 +309,16 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
return syserror.EEXIST
}
fs.mu.Lock()
+ defer fs.processDeferredDecRefs(ctx)
defer fs.mu.Unlock()
- parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
- fs.processDeferredDecRefsLocked()
+ parent, err := fs.walkParentDirLocked(ctx, rp)
if err != nil {
return err
}
- pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
+
+ parent.dirMu.Lock()
+ defer parent.dirMu.Unlock()
+ pc, err := checkCreateLocked(ctx, rp, parent)
if err != nil {
return err
}
@@ -321,11 +335,13 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
return syserror.EPERM
}
- childVFSD, err := parentInode.NewLink(ctx, pc, d.inode)
+ childI, err := parent.inode.NewLink(ctx, pc, d.inode)
if err != nil {
return err
}
- parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
+ var child Dentry
+ child.Init(fs, childI)
+ parent.insertChildLocked(pc, &child)
return nil
}
@@ -335,13 +351,16 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
return syserror.EEXIST
}
fs.mu.Lock()
+ defer fs.processDeferredDecRefs(ctx)
defer fs.mu.Unlock()
- parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
- fs.processDeferredDecRefsLocked()
+ parent, err := fs.walkParentDirLocked(ctx, rp)
if err != nil {
return err
}
- pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
+
+ parent.dirMu.Lock()
+ defer parent.dirMu.Unlock()
+ pc, err := checkCreateLocked(ctx, rp, parent)
if err != nil {
return err
}
@@ -349,11 +368,16 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
return err
}
defer rp.Mount().EndWrite()
- childVFSD, err := parentInode.NewDir(ctx, pc, opts)
+ childI, err := parent.inode.NewDir(ctx, pc, opts)
if err != nil {
- return err
+ if !opts.ForSyntheticMountpoint || err == syserror.EEXIST {
+ return err
+ }
+ childI = newSyntheticDirectory(ctx, rp.Credentials(), opts.Mode)
}
- parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
+ var child Dentry
+ child.Init(fs, childI)
+ parent.insertChildLocked(pc, &child)
return nil
}
@@ -363,13 +387,16 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
return syserror.EEXIST
}
fs.mu.Lock()
+ defer fs.processDeferredDecRefs(ctx)
defer fs.mu.Unlock()
- parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
- fs.processDeferredDecRefsLocked()
+ parent, err := fs.walkParentDirLocked(ctx, rp)
if err != nil {
return err
}
- pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
+
+ parent.dirMu.Lock()
+ defer parent.dirMu.Unlock()
+ pc, err := checkCreateLocked(ctx, rp, parent)
if err != nil {
return err
}
@@ -377,11 +404,13 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
return err
}
defer rp.Mount().EndWrite()
- newVFSD, err := parentInode.NewNode(ctx, pc, opts)
+ newI, err := parent.inode.NewNode(ctx, pc, opts)
if err != nil {
return err
}
- parentVFSD.Impl().(*Dentry).InsertChild(pc, newVFSD.Impl().(*Dentry))
+ var newD Dentry
+ newD.Init(fs, newI)
+ parent.insertChildLocked(pc, &newD)
return nil
}
@@ -397,24 +426,41 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
// Do not create new file.
if opts.Flags&linux.O_CREAT == 0 {
fs.mu.RLock()
- defer fs.processDeferredDecRefs()
- defer fs.mu.RUnlock()
- vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
+ defer fs.processDeferredDecRefs(ctx)
+ d, err := fs.walkExistingLocked(ctx, rp)
if err != nil {
+ fs.mu.RUnlock()
return nil, err
}
- if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
+ if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
+ fs.mu.RUnlock()
return nil, err
}
- return inode.Open(ctx, rp, vfsd, opts)
+ // Open may block so we need to unlock fs.mu. IncRef d to prevent
+ // its destruction while fs.mu is unlocked.
+ d.IncRef()
+ fs.mu.RUnlock()
+ fd, err := d.inode.Open(ctx, rp, d, opts)
+ d.DecRef(ctx)
+ return fd, err
}
// May create new file.
mustCreate := opts.Flags&linux.O_EXCL != 0
- vfsd := rp.Start()
- inode := vfsd.Impl().(*Dentry).inode
+ d := rp.Start().Impl().(*Dentry)
fs.mu.Lock()
- defer fs.mu.Unlock()
+ unlocked := false
+ unlock := func() {
+ if !unlocked {
+ fs.mu.Unlock()
+ unlocked = true
+ }
+ }
+ // Process all to-be-decref'd dentries at the end at once.
+ // Since we defer unlock() AFTER this, fs.mu is guaranteed to be unlocked
+ // when this is executed.
+ defer fs.processDeferredDecRefs(ctx)
+ defer unlock()
if rp.Done() {
if rp.MustBeDir() {
return nil, syserror.EISDIR
@@ -422,19 +468,24 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
if mustCreate {
return nil, syserror.EEXIST
}
- if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
+ if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
return nil, err
}
- return inode.Open(ctx, rp, vfsd, opts)
+ // Open may block so we need to unlock fs.mu. IncRef d to prevent
+ // its destruction while fs.mu is unlocked.
+ d.IncRef()
+ unlock()
+ fd, err := d.inode.Open(ctx, rp, d, opts)
+ d.DecRef(ctx)
+ return fd, err
}
afterTrailingSymlink:
- parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
- fs.processDeferredDecRefsLocked()
+ parent, err := fs.walkParentDirLocked(ctx, rp)
if err != nil {
return nil, err
}
// Check for search permission in the parent directory.
- if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
+ if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
return nil, err
}
// Reject attempts to open directories with O_CREAT.
@@ -449,10 +500,10 @@ afterTrailingSymlink:
return nil, syserror.ENAMETOOLONG
}
// Determine whether or not we need to create a file.
- childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD)
+ child, err := fs.stepExistingLocked(ctx, rp, parent, false /* mayFollowSymlinks */)
if err == syserror.ENOENT {
// Already checked for searchability above; now check for writability.
- if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
+ if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
return nil, err
}
if err := rp.Mount().CheckBeginWrite(); err != nil {
@@ -460,13 +511,20 @@ afterTrailingSymlink:
}
defer rp.Mount().EndWrite()
// Create and open the child.
- childVFSD, err = parentInode.NewFile(ctx, pc, opts)
+ childI, err := parent.inode.NewFile(ctx, pc, opts)
if err != nil {
return nil, err
}
- child := childVFSD.Impl().(*Dentry)
- parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
- return child.inode.Open(ctx, rp, childVFSD, opts)
+ var child Dentry
+ child.Init(fs, childI)
+ parent.insertChild(pc, &child)
+ // Open may block so we need to unlock fs.mu. IncRef child to prevent
+ // its destruction while fs.mu is unlocked.
+ child.IncRef()
+ unlock()
+ fd, err := child.inode.Open(ctx, rp, &child, opts)
+ child.DecRef(ctx)
+ return fd, err
}
if err != nil {
return nil, err
@@ -475,7 +533,6 @@ afterTrailingSymlink:
if mustCreate {
return nil, syserror.EEXIST
}
- child := childVFSD.Impl().(*Dentry)
if rp.ShouldFollowSymlink() && child.isSymlink() {
targetVD, targetPathname, err := child.inode.Getlink(ctx, rp.Mount())
if err != nil {
@@ -483,7 +540,7 @@ afterTrailingSymlink:
}
if targetVD.Ok() {
err := rp.HandleJump(targetVD)
- targetVD.DecRef()
+ fs.deferDecRefVD(ctx, targetVD)
if err != nil {
return nil, err
}
@@ -499,22 +556,28 @@ afterTrailingSymlink:
if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
return nil, err
}
- return child.inode.Open(ctx, rp, &child.vfsd, opts)
+ // Open may block so we need to unlock fs.mu. IncRef child to prevent
+ // its destruction while fs.mu is unlocked.
+ child.IncRef()
+ unlock()
+ fd, err := child.inode.Open(ctx, rp, child, opts)
+ child.DecRef(ctx)
+ return fd, err
}
// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
fs.mu.RLock()
- d, inode, err := fs.walkExistingLocked(ctx, rp)
- fs.mu.RUnlock()
- fs.processDeferredDecRefs()
+ defer fs.processDeferredDecRefs(ctx)
+ defer fs.mu.RUnlock()
+ d, err := fs.walkExistingLocked(ctx, rp)
if err != nil {
return "", err
}
- if !d.Impl().(*Dentry).isSymlink() {
+ if !d.isSymlink() {
return "", syserror.EINVAL
}
- return inode.Readlink(ctx)
+ return d.inode.Readlink(ctx, rp.Mount())
}
// RenameAt implements vfs.FilesystemImpl.RenameAt.
@@ -526,16 +589,15 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0
fs.mu.Lock()
- defer fs.processDeferredDecRefsLocked()
+ defer fs.processDeferredDecRefs(ctx)
defer fs.mu.Unlock()
// Resolve the destination directory first to verify that it's on this
// Mount.
- dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp)
+ dstDir, err := fs.walkParentDirLocked(ctx, rp)
if err != nil {
return err
}
- dstDir := dstDirVFSD.Impl().(*Dentry)
mnt := rp.Mount()
if mnt != oldParentVD.Mount() {
return syserror.EXDEV
@@ -553,16 +615,15 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
if err != nil {
return err
}
- srcVFSD := &src.vfsd
// Can we remove the src dentry?
- if err := checkDeleteLocked(ctx, rp, srcVFSD); err != nil {
+ if err := checkDeleteLocked(ctx, rp, src); err != nil {
return err
}
// Can we create the dst dentry?
var dst *Dentry
- pc, err := checkCreateLocked(ctx, rp, dstDirVFSD, dstDirInode)
+ pc, err := checkCreateLocked(ctx, rp, dstDir)
switch err {
case nil:
// Ok, continue with rename as replacement.
@@ -573,18 +634,18 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
}
dst = dstDir.children[pc]
if dst == nil {
- panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDirVFSD))
+ panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDir))
}
default:
return err
}
var dstVFSD *vfs.Dentry
if dst != nil {
- dstVFSD = &dst.vfsd
+ dstVFSD = dst.VFSDentry()
}
mntns := vfs.MountNamespaceFromContext(ctx)
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
virtfs := rp.VirtualFilesystem()
// We can't deadlock here due to lock ordering because we're protected from
@@ -596,35 +657,44 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
defer dstDir.dirMu.Unlock()
}
+ srcVFSD := src.VFSDentry()
if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil {
return err
}
- replaced, err := srcDir.inode.Rename(ctx, src.name, pc, srcVFSD, dstDirVFSD)
+ err = srcDir.inode.Rename(ctx, src.name, pc, src.inode, dstDir.inode)
if err != nil {
virtfs.AbortRenameDentry(srcVFSD, dstVFSD)
return err
}
delete(srcDir.children, src.name)
if srcDir != dstDir {
- fs.deferDecRef(srcDirVFSD)
- dstDir.IncRef()
+ fs.deferDecRef(srcDir) // child (src) drops ref on old parent.
+ dstDir.IncRef() // child (src) takes a ref on the new parent.
}
src.parent = dstDir
src.name = pc
if dstDir.children == nil {
dstDir.children = make(map[string]*Dentry)
}
+ replaced := dstDir.children[pc]
dstDir.children[pc] = src
- virtfs.CommitRenameReplaceDentry(srcVFSD, replaced)
+ var replaceVFSD *vfs.Dentry
+ if replaced != nil {
+ // deferDecRef so that fs.mu and dstDir.dirMu are unlocked by then.
+ fs.deferDecRef(replaced)
+ replaceVFSD = replaced.VFSDentry()
+ }
+ virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaceVFSD)
return nil
}
// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
fs.mu.Lock()
+ defer fs.processDeferredDecRefs(ctx)
defer fs.mu.Unlock()
- vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
- fs.processDeferredDecRefsLocked()
+
+ d, err := fs.walkExistingLocked(ctx, rp)
if err != nil {
return err
}
@@ -632,14 +702,13 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
return err
}
defer rp.Mount().EndWrite()
- if err := checkDeleteLocked(ctx, rp, vfsd); err != nil {
+ if err := checkDeleteLocked(ctx, rp, d); err != nil {
return err
}
- d := vfsd.Impl().(*Dentry)
if !d.isDir() {
return syserror.ENOTDIR
}
- if inode.HasChildren() {
+ if d.inode.HasChildren() {
return syserror.ENOTEMPTY
}
virtfs := rp.VirtualFilesystem()
@@ -648,56 +717,60 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
defer parentDentry.dirMu.Unlock()
mntns := vfs.MountNamespaceFromContext(ctx)
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
+ vfsd := d.VFSDentry()
if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
return err
}
- if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil {
+
+ if err := parentDentry.inode.RmDir(ctx, d.name, d.inode); err != nil {
virtfs.AbortDeleteDentry(vfsd)
return err
}
- virtfs.CommitDeleteDentry(vfsd)
+ delete(parentDentry.children, d.name)
+ // Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then.
+ fs.deferDecRef(d)
+ virtfs.CommitDeleteDentry(ctx, vfsd)
return nil
}
// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
fs.mu.RLock()
- _, inode, err := fs.walkExistingLocked(ctx, rp)
- fs.mu.RUnlock()
- fs.processDeferredDecRefs()
+ defer fs.processDeferredDecRefs(ctx)
+ defer fs.mu.RUnlock()
+ d, err := fs.walkExistingLocked(ctx, rp)
if err != nil {
return err
}
if opts.Stat.Mask == 0 {
return nil
}
- return inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts)
+ return d.inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts)
}
// StatAt implements vfs.FilesystemImpl.StatAt.
func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
fs.mu.RLock()
- _, inode, err := fs.walkExistingLocked(ctx, rp)
- fs.mu.RUnlock()
- fs.processDeferredDecRefs()
+ defer fs.processDeferredDecRefs(ctx)
+ defer fs.mu.RUnlock()
+ d, err := fs.walkExistingLocked(ctx, rp)
if err != nil {
return linux.Statx{}, err
}
- return inode.Stat(fs.VFSFilesystem(), opts)
+ return d.inode.Stat(ctx, fs.VFSFilesystem(), opts)
}
// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
fs.mu.RLock()
- _, _, err := fs.walkExistingLocked(ctx, rp)
- fs.mu.RUnlock()
- fs.processDeferredDecRefs()
+ defer fs.processDeferredDecRefs(ctx)
+ defer fs.mu.RUnlock()
+ d, err := fs.walkExistingLocked(ctx, rp)
if err != nil {
return linux.Statfs{}, err
}
- // TODO(gvisor.dev/issue/1193): actually implement statfs.
- return linux.Statfs{}, syserror.ENOSYS
+ return d.inode.StatFS(ctx, fs.VFSFilesystem())
}
// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
@@ -706,13 +779,16 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
return syserror.EEXIST
}
fs.mu.Lock()
+ defer fs.processDeferredDecRefs(ctx)
defer fs.mu.Unlock()
- parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
- fs.processDeferredDecRefsLocked()
+ parent, err := fs.walkParentDirLocked(ctx, rp)
if err != nil {
return err
}
- pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
+ parent.dirMu.Lock()
+ defer parent.dirMu.Unlock()
+
+ pc, err := checkCreateLocked(ctx, rp, parent)
if err != nil {
return err
}
@@ -720,20 +796,23 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
return err
}
defer rp.Mount().EndWrite()
- childVFSD, err := parentInode.NewSymlink(ctx, pc, target)
+ childI, err := parent.inode.NewSymlink(ctx, pc, target)
if err != nil {
return err
}
- parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
+ var child Dentry
+ child.Init(fs, childI)
+ parent.insertChildLocked(pc, &child)
return nil
}
// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
fs.mu.Lock()
+ defer fs.processDeferredDecRefs(ctx)
defer fs.mu.Unlock()
- vfsd, _, err := fs.walkExistingLocked(ctx, rp)
- fs.processDeferredDecRefsLocked()
+
+ d, err := fs.walkExistingLocked(ctx, rp)
if err != nil {
return err
}
@@ -741,10 +820,9 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
return err
}
defer rp.Mount().EndWrite()
- if err := checkDeleteLocked(ctx, rp, vfsd); err != nil {
+ if err := checkDeleteLocked(ctx, rp, d); err != nil {
return err
}
- d := vfsd.Impl().(*Dentry)
if d.isDir() {
return syserror.EISDIR
}
@@ -753,39 +831,43 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
parentDentry.dirMu.Lock()
defer parentDentry.dirMu.Unlock()
mntns := vfs.MountNamespaceFromContext(ctx)
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
+ vfsd := d.VFSDentry()
if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
return err
}
- if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil {
+ if err := parentDentry.inode.Unlink(ctx, d.name, d.inode); err != nil {
virtfs.AbortDeleteDentry(vfsd)
return err
}
- virtfs.CommitDeleteDentry(vfsd)
+ delete(parentDentry.children, d.name)
+ // Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then.
+ fs.deferDecRef(d)
+ virtfs.CommitDeleteDentry(ctx, vfsd)
return nil
}
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
fs.mu.RLock()
- _, inode, err := fs.walkExistingLocked(ctx, rp)
- fs.mu.RUnlock()
- fs.processDeferredDecRefs()
+ defer fs.processDeferredDecRefs(ctx)
+ defer fs.mu.RUnlock()
+ d, err := fs.walkExistingLocked(ctx, rp)
if err != nil {
return nil, err
}
- if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
+ if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
return nil, err
}
return nil, syserror.ECONNREFUSED
}
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *Filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
fs.mu.RLock()
- _, _, err := fs.walkExistingLocked(ctx, rp)
- fs.mu.RUnlock()
- fs.processDeferredDecRefs()
+ defer fs.processDeferredDecRefs(ctx)
+ defer fs.mu.RUnlock()
+ _, err := fs.walkExistingLocked(ctx, rp)
if err != nil {
return nil, err
}
@@ -793,12 +875,12 @@ func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
return nil, syserror.ENOTSUP
}
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *Filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
fs.mu.RLock()
- _, _, err := fs.walkExistingLocked(ctx, rp)
- fs.mu.RUnlock()
- fs.processDeferredDecRefs()
+ defer fs.processDeferredDecRefs(ctx)
+ defer fs.mu.RUnlock()
+ _, err := fs.walkExistingLocked(ctx, rp)
if err != nil {
return "", err
}
@@ -806,12 +888,12 @@ func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
return "", syserror.ENOTSUP
}
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *Filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
fs.mu.RLock()
- _, _, err := fs.walkExistingLocked(ctx, rp)
- fs.mu.RUnlock()
- fs.processDeferredDecRefs()
+ defer fs.processDeferredDecRefs(ctx)
+ defer fs.mu.RUnlock()
+ _, err := fs.walkExistingLocked(ctx, rp)
if err != nil {
return err
}
@@ -819,12 +901,12 @@ func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
return syserror.ENOTSUP
}
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *Filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
fs.mu.RLock()
- _, _, err := fs.walkExistingLocked(ctx, rp)
- fs.mu.RUnlock()
- fs.processDeferredDecRefs()
+ defer fs.processDeferredDecRefs(ctx)
+ defer fs.mu.RUnlock()
+ _, err := fs.walkExistingLocked(ctx, rp)
if err != nil {
return err
}
@@ -838,3 +920,16 @@ func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
defer fs.mu.RUnlock()
return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*Dentry), b)
}
+
+func (fs *Filesystem) deferDecRefVD(ctx context.Context, vd vfs.VirtualDentry) {
+ if d, ok := vd.Dentry().Impl().(*Dentry); ok && d.fs == fs {
+ // The following is equivalent to vd.DecRef(ctx). This is needed
+ // because if d belongs to this filesystem, we cannot DecRef it right
+ // away while holding fs.mu: d.DecRef may itself acquire fs.mu. So we
+ // defer the DecRef until the locks are dropped.
+ vd.Mount().DecRef(ctx)
+ fs.deferDecRef(d)
+ } else {
+ vd.DecRef(ctx)
+ }
+}
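
OpenAt's unlocked/unlock bookkeeping is the "unlock at most once" idiom: drop the mutex early on paths that may block, while the deferred call still guarantees release on every other path. Distilled into a standalone sketch:

	package main

	import "sync"

	var mu sync.Mutex

	// doOpen mirrors OpenAt's locking shape: the deferred unlock() is a
	// no-op when the explicit unlock() on the slow path already ran, so mu
	// is released exactly once on every path.
	func doOpen(mayBlock bool) error {
		mu.Lock()
		unlocked := false
		unlock := func() {
			if !unlocked {
				mu.Unlock()
				unlocked = true
			}
		}
		defer unlock()

		if !mayBlock {
			return nil // fast path: the deferred unlock() releases mu
		}
		unlock() // release mu before the potentially blocking call
		// ... blocking work happens here without mu held ...
		return nil
	}

	func main() {
		_ = doOpen(false)
		_ = doOpen(true)
	}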
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 982daa2e6..d9d76758a 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -20,11 +20,12 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// InodeNoopRefCount partially implements the Inode interface, specifically the
@@ -32,7 +33,10 @@ import (
// count for inodes, performing no extra actions when references are obtained or
// released. This is suitable for simple file inodes that don't reference any
// resources.
+//
+// +stateify savable
type InodeNoopRefCount struct {
+ InodeTemporary
}
// IncRef implements Inode.IncRef.
@@ -40,7 +44,7 @@ func (InodeNoopRefCount) IncRef() {
}
// DecRef implements Inode.DecRef.
-func (InodeNoopRefCount) DecRef() {
+func (InodeNoopRefCount) DecRef(context.Context) {
}
// TryIncRef implements Inode.TryIncRef.
@@ -48,37 +52,35 @@ func (InodeNoopRefCount) TryIncRef() bool {
return true
}
-// Destroy implements Inode.Destroy.
-func (InodeNoopRefCount) Destroy() {
-}
-
// InodeDirectoryNoNewChildren partially implements the Inode interface.
// InodeDirectoryNoNewChildren represents a directory inode which does not
// support creation of new children.
+//
+// +stateify savable
type InodeDirectoryNoNewChildren struct{}
// NewFile implements Inode.NewFile.
-func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (Inode, error) {
return nil, syserror.EPERM
}
// NewDir implements Inode.NewDir.
-func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (Inode, error) {
return nil, syserror.EPERM
}
// NewLink implements Inode.NewLink.
-func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (Inode, error) {
return nil, syserror.EPERM
}
// NewSymlink implements Inode.NewSymlink.
-func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (Inode, error) {
return nil, syserror.EPERM
}
// NewNode implements Inode.NewNode.
-func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (Inode, error) {
return nil, syserror.EPERM
}
@@ -86,7 +88,10 @@ func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOpt
// inodeDirectory and inodeDynamicDirectory sub interfaces. Inodes that do not
// represent directories can embed this to provide no-op implementations for
// directory-related functions.
+//
+// +stateify savable
type InodeNotDirectory struct {
+ InodeAlwaysValid
}
// HasChildren implements Inode.HasChildren.
@@ -95,89 +100,64 @@ func (InodeNotDirectory) HasChildren() bool {
}
// NewFile implements Inode.NewFile.
-func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (Inode, error) {
panic("NewFile called on non-directory inode")
}
// NewDir implements Inode.NewDir.
-func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (Inode, error) {
panic("NewDir called on non-directory inode")
}
// NewLink implements Inode.NewLink.
-func (InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewLink(context.Context, string, Inode) (Inode, error) {
panic("NewLink called on non-directory inode")
}
// NewSymlink implements Inode.NewSymlink.
-func (InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewSymlink(context.Context, string, string) (Inode, error) {
panic("NewSymlink called on non-directory inode")
}
// NewNode implements Inode.NewNode.
-func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (Inode, error) {
panic("NewNode called on non-directory inode")
}
// Unlink implements Inode.Unlink.
-func (InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error {
+func (InodeNotDirectory) Unlink(context.Context, string, Inode) error {
panic("Unlink called on non-directory inode")
}
// RmDir implements Inode.RmDir.
-func (InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error {
+func (InodeNotDirectory) RmDir(context.Context, string, Inode) error {
panic("RmDir called on non-directory inode")
}
// Rename implements Inode.Rename.
-func (InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) {
+func (InodeNotDirectory) Rename(context.Context, string, string, Inode, Inode) error {
panic("Rename called on non-directory inode")
}
// Lookup implements Inode.Lookup.
-func (InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+func (InodeNotDirectory) Lookup(ctx context.Context, name string) (Inode, error) {
panic("Lookup called on non-directory inode")
}
// IterDirents implements Inode.IterDirents.
-func (InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
+func (InodeNotDirectory) IterDirents(ctx context.Context, mnt *vfs.Mount, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
panic("IterDirents called on non-directory inode")
}
-// Valid implements Inode.Valid.
-func (InodeNotDirectory) Valid(context.Context) bool {
- return true
-}
-
-// InodeNoDynamicLookup partially implements the Inode interface, specifically
-// the inodeDynamicLookup sub interface. Directory inodes that do not support
-// dymanic entries (i.e. entries that are not "hashed" into the
-// vfs.Dentry.children) can embed this to provide no-op implementations for
-// functions related to dynamic entries.
-type InodeNoDynamicLookup struct{}
-
-// Lookup implements Inode.Lookup.
-func (InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
- return nil, syserror.ENOENT
-}
-
-// IterDirents implements Inode.IterDirents.
-func (InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
- return offset, nil
-}
-
-// Valid implements Inode.Valid.
-func (InodeNoDynamicLookup) Valid(ctx context.Context) bool {
- return true
-}
-
// InodeNotSymlink partially implements the Inode interface, specifically the
// inodeSymlink sub interface. All inodes that are not symlinks may embed this
// to return the appropriate errors from symlink-related functions.
+//
+// +stateify savable
type InodeNotSymlink struct{}
// Readlink implements Inode.Readlink.
-func (InodeNotSymlink) Readlink(context.Context) (string, error) {
+func (InodeNotSymlink) Readlink(context.Context, *vfs.Mount) (string, error) {
return "", syserror.EINVAL
}
@@ -191,18 +171,26 @@ func (InodeNotSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry,
// inode attributes.
//
// Must be initialized by Init prior to first use.
+//
+// +stateify savable
type InodeAttrs struct {
- devMajor uint32
- devMinor uint32
- ino uint64
- mode uint32
- uid uint32
- gid uint32
- nlink uint32
+ devMajor uint32
+ devMinor uint32
+ ino uint64
+ mode uint32
+ uid uint32
+ gid uint32
+ nlink uint32
+ blockSize uint32
+
+ // Timestamps, all nsecs from the Unix epoch.
+ atime int64
+ mtime int64
+ ctime int64
}
// Init initializes this InodeAttrs.
-func (a *InodeAttrs) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, mode linux.FileMode) {
+func (a *InodeAttrs) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, mode linux.FileMode) {
if mode.FileType() == 0 {
panic(fmt.Sprintf("No file type specified in 'mode' for InodeAttrs.Init(): mode=0%o", mode))
}
@@ -218,6 +206,11 @@ func (a *InodeAttrs) Init(creds *auth.Credentials, devMajor, devMinor uint32, in
atomic.StoreUint32(&a.uid, uint32(creds.EffectiveKUID))
atomic.StoreUint32(&a.gid, uint32(creds.EffectiveKGID))
atomic.StoreUint32(&a.nlink, nlink)
+ atomic.StoreUint32(&a.blockSize, usermem.PageSize)
+ now := ktime.NowFromContext(ctx).Nanoseconds()
+ atomic.StoreInt64(&a.atime, now)
+ atomic.StoreInt64(&a.mtime, now)
+ atomic.StoreInt64(&a.ctime, now)
}
// DevMajor returns the device major number.
@@ -240,12 +233,33 @@ func (a *InodeAttrs) Mode() linux.FileMode {
return linux.FileMode(atomic.LoadUint32(&a.mode))
}
+// TouchAtime updates a.atime to the current time.
+func (a *InodeAttrs) TouchAtime(ctx context.Context, mnt *vfs.Mount) {
+ if mnt.Flags.NoATime || mnt.ReadOnly() {
+ return
+ }
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return
+ }
+ atomic.StoreInt64(&a.atime, ktime.NowFromContext(ctx).Nanoseconds())
+ mnt.EndWrite()
+}
+
+// TouchCMtime updates a.{c/m}time to the current time. The caller should
+// synchronize calls to this so that ctime and mtime are updated to the same
+// value.
+func (a *InodeAttrs) TouchCMtime(ctx context.Context) {
+ now := ktime.NowFromContext(ctx).Nanoseconds()
+ atomic.StoreInt64(&a.mtime, now)
+ atomic.StoreInt64(&a.ctime, now)
+}
+
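An embedder would typically drive these helpers from its file descriptions: TouchAtime after a successful read, TouchCMtime (under the FD's data lock) after a mutation. A minimal hedged sketch, assuming a hypothetical client package and fileInode type:

package myfs // hypothetical kernfs client

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// fileInode is a hypothetical regular-file inode embedding InodeAttrs.
type fileInode struct {
	kernfs.InodeAttrs
	data []byte
}

// afterRead would be called by the file's FD after a successful read.
func (i *fileInode) afterRead(ctx context.Context, mnt *vfs.Mount) {
	// TouchAtime silently skips the update on noatime or read-only mounts.
	i.TouchAtime(ctx, mnt)
}

// afterWrite would be called with the FD's data mutex held, providing the
// synchronization TouchCMtime asks of its callers.
func (i *fileInode) afterWrite(ctx context.Context, buf []byte) {
	i.data = append(i.data, buf...)
	i.TouchCMtime(ctx)
}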
// Stat partially implements Inode.Stat. Note that this function doesn't provide
// all the stat fields, and the embedder should consider extending the result
// with filesystem-specific fields.
-func (a *InodeAttrs) Stat(*vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) {
+func (a *InodeAttrs) Stat(context.Context, *vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) {
var stat linux.Statx
- stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK
+ stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME
stat.DevMajor = a.devMajor
stat.DevMinor = a.devMinor
stat.Ino = atomic.LoadUint64(&a.ino)
@@ -253,9 +267,10 @@ func (a *InodeAttrs) Stat(*vfs.Filesystem, vfs.StatOptions) (linux.Statx, error)
stat.UID = atomic.LoadUint32(&a.uid)
stat.GID = atomic.LoadUint32(&a.gid)
stat.Nlink = atomic.LoadUint32(&a.nlink)
-
- // TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps.
-
+ stat.Blksize = atomic.LoadUint32(&a.blockSize)
+ stat.Atime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&a.atime))
+ stat.Mtime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&a.mtime))
+ stat.Ctime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&a.ctime))
return stat, nil
}
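Since Stat deliberately leaves out fields such as the size, a concrete inode usually wraps it. A hedged sketch of such an embedder (fileInode is hypothetical):

package myfs // hypothetical kernfs client

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

type fileInode struct {
	kernfs.InodeAttrs
	data []byte
}

// Stat delegates to InodeAttrs.Stat for the generic fields, then fills in
// the size, which the generic implementation knows nothing about.
func (i *fileInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
	stat, err := i.InodeAttrs.Stat(ctx, fs, opts)
	if err != nil {
		return linux.Statx{}, err
	}
	stat.Mask |= linux.STATX_SIZE
	stat.Size = uint64(len(i.data))
	return stat, nil
}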
@@ -264,10 +279,18 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut
if opts.Stat.Mask == 0 {
return nil
}
- if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 {
+
+ // Note that not all fields are modifiable. For example, the file type and
+ // inode numbers are immutable after node creation. Setting the size is often
+ // allowed by kernfs files but does not do anything. If some other behavior is
+ // needed, the embedder should consider extending SetStat.
+ if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 {
return syserror.EPERM
}
- if err := vfs.CheckSetStat(ctx, creds, &opts.Stat, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
+ if opts.Stat.Mask&linux.STATX_SIZE != 0 && a.Mode().IsDir() {
+ return syserror.EISDIR
+ }
+ if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
return err
}
@@ -289,10 +312,19 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut
atomic.StoreUint32(&a.gid, stat.GID)
}
- // Note that not all fields are modifiable. For example, the file type and
- // inode numbers are immutable after node creation.
-
- // TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps.
+ now := ktime.NowFromContext(ctx).Nanoseconds()
+ if stat.Mask&linux.STATX_ATIME != 0 {
+ if stat.Atime.Nsec == linux.UTIME_NOW {
+ stat.Atime = linux.NsecToStatxTimestamp(now)
+ }
+ atomic.StoreInt64(&a.atime, stat.Atime.ToNsec())
+ }
+ if stat.Mask&linux.STATX_MTIME != 0 {
+ if stat.Mtime.Nsec == linux.UTIME_NOW {
+ stat.Mtime = linux.NsecToStatxTimestamp(now)
+ }
+ atomic.StoreInt64(&a.mtime, stat.Mtime.ToNsec())
+ }
return nil
}
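The UTIME_NOW branches above let a caller request "stamp with the current time" without reading the clock itself, mirroring utimensat(2). A hedged sketch of such a caller (the helper is hypothetical):

package myfs // hypothetical kernfs client

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// touchMtimeNow asks SetStat to stamp mtime with the current time; EPERM
// would indicate the mask requested a field kernfs doesn't allow changing.
func touchMtimeNow(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, inode kernfs.Inode) error {
	return inode.SetStat(ctx, fs, creds, vfs.SetStatOptions{
		Stat: linux.Statx{
			Mask:  linux.STATX_MTIME,
			Mtime: linux.StatxTimestamp{Nsec: linux.UTIME_NOW},
		},
	})
}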
@@ -323,13 +355,17 @@ func (a *InodeAttrs) DecLinks() {
}
}
+// +stateify savable
type slot struct {
- Name string
- Dentry *vfs.Dentry
+ name string
+ inode Inode
+ static bool
slotEntry
}
// OrderedChildrenOptions contains initialization options for OrderedChildren.
+//
+// +stateify savable
type OrderedChildrenOptions struct {
// Writable indicates whether vfs.FilesystemImpl methods implemented by
// OrderedChildren may modify the tracked children. This applies to
@@ -339,20 +375,28 @@ type OrderedChildrenOptions struct {
}
// OrderedChildren partially implements the Inode interface. OrderedChildren can
-// be embedded in directory inodes to keep track of the children in the
+// be embedded in directory inodes to keep track of children in the
// directory, and can then be used to implement a generic directory FD -- see
-// GenericDirectoryFD. OrderedChildren is not compatible with dynamic
-// directories.
+// GenericDirectoryFD.
+//
+// OrderedChildren can represent a node in an Inode tree. The child inodes
+// may themselves be directories using OrderedChildren, extending the tree.
+// The parent inode (the OrderedChildren user) holds a ref on all of its
+// static children, which lets the static inodes outlive their associated
+// dentries. While a dentry might have to be regenerated via a Lookup() call,
+// the same static inode can be reused. These static child inodes are finally
+// DecRef'd when the directory inode is destroyed. This makes OrderedChildren
+// suitable for static directory entries as well.
//
// Must be initialized with Init before first use.
+//
+// +stateify savable
type OrderedChildren struct {
- refs.AtomicRefCount
-
// Can children be modified by user syscalls? If set to false, interface
// methods that would modify the children return EPERM. Immutable.
writable bool
- mu sync.RWMutex
+ mu sync.RWMutex `state:"nosave"`
order slotList
set map[string]*slot
}
@@ -363,39 +407,66 @@ func (o *OrderedChildren) Init(opts OrderedChildrenOptions) {
o.set = make(map[string]*slot)
}
-// DecRef implements Inode.DecRef.
-func (o *OrderedChildren) DecRef() {
- o.AtomicRefCount.DecRefWithDestructor(o.Destroy)
-}
-
-// Destroy cleans up resources referenced by this OrderedChildren.
-func (o *OrderedChildren) Destroy() {
+// Destroy clears the children stored in o. It should be called by structs
+// embedding OrderedChildren upon destruction, i.e. when their reference count
+// reaches zero.
+func (o *OrderedChildren) Destroy(ctx context.Context) {
o.mu.Lock()
defer o.mu.Unlock()
+ // Drop the ref that o owns on the static inodes it holds.
+ for _, s := range o.set {
+ if s.static {
+ s.inode.DecRef(ctx)
+ }
+ }
o.order.Reset()
o.set = nil
}
-// Populate inserts children into this OrderedChildren, and d's dentry
-// cache. Populate returns the number of directories inserted, which the caller
+// Populate inserts static children into this OrderedChildren.
+// Populate returns the number of directories inserted, which the caller
// may use to update the link count for the parent directory.
//
-// Precondition: d must represent a directory inode. children must not contain
-// any conflicting entries already in o.
-func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint32 {
+// Precondition:
+// * d must represent a directory inode.
+// * children must not contain any conflicting entries already in o.
+// * Caller must hold a reference on all inodes passed.
+//
+// Postcondition: Caller's references on inodes are transferred to o.
+func (o *OrderedChildren) Populate(children map[string]Inode) uint32 {
var links uint32
for name, child := range children {
- if child.isDir() {
+ if child.Mode().IsDir() {
links++
}
- if err := o.Insert(name, child.VFSDentry()); err != nil {
- panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d))
+ if err := o.insert(name, child, true); err != nil {
+ panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v)", name, child))
}
- d.InsertChild(name, child)
}
return links
}
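Because the caller's references move into o, a client that builds its children inline must not DecRef them afterwards. A minimal hedged sketch, assuming a hypothetical dirInode and newFileInode constructor:

package myfs // hypothetical kernfs client

import "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"

type dirInode struct {
	kernfs.InodeAttrs
	kernfs.OrderedChildren
	// ... remaining kernfs mixins elided
}

// newFileInode returns a file inode holding one reference; its
// implementation is omitted here.
func newFileInode() kernfs.Inode { panic("omitted") }

func (d *dirInode) populate() {
	d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
	children := map[string]kernfs.Inode{
		"status": newFileInode(),
		"stat":   newFileInode(),
	}
	// Populate takes over the single ref each constructor returned; each
	// child directory also contributes one link ("..") to this directory.
	d.IncLinks(d.OrderedChildren.Populate(children))
	// No DecRef on the children here: ownership now lives in d.
}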
+// Lookup implements Inode.Lookup.
+func (o *OrderedChildren) Lookup(ctx context.Context, name string) (Inode, error) {
+ o.mu.RLock()
+ defer o.mu.RUnlock()
+
+ s, ok := o.set[name]
+ if !ok {
+ return nil, syserror.ENOENT
+ }
+
+ s.inode.IncRef() // This ref is passed to the dentry upon creation via Init.
+ return s.inode, nil
+}
+
+// IterDirents implements Inode.IterDirents.
+func (o *OrderedChildren) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
+ // All entries from OrderedChildren have already been handled in
+ // GenericDirectoryFD.IterDirents.
+ return offset, nil
+}
+
// HasChildren implements Inode.HasChildren.
func (o *OrderedChildren) HasChildren() bool {
o.mu.RLock()
@@ -403,17 +474,27 @@ func (o *OrderedChildren) HasChildren() bool {
return len(o.set) > 0
}
-// Insert inserts child into o. This ignores the writability of o, as this is
-// not part of the vfs.FilesystemImpl interface, and is a lower-level operation.
-func (o *OrderedChildren) Insert(name string, child *vfs.Dentry) error {
+// Insert inserts a dynamic child into o. This ignores the writability of o, as
+// this is not part of the vfs.FilesystemImpl interface, and is a lower-level operation.
+func (o *OrderedChildren) Insert(name string, child Inode) error {
+ return o.insert(name, child, false)
+}
+
+// insert inserts child into o.
+//
+// Precondition: Caller must be holding a ref on child if static is true.
+//
+// Postcondition: Caller's ref on child is transferred to o if static is true.
+func (o *OrderedChildren) insert(name string, child Inode, static bool) error {
o.mu.Lock()
defer o.mu.Unlock()
if _, ok := o.set[name]; ok {
return syserror.EEXIST
}
s := &slot{
- Name: name,
- Dentry: child,
+ name: name,
+ inode: child,
+ static: static,
}
o.order.PushBack(s)
o.set[name] = s
@@ -423,44 +504,49 @@ func (o *OrderedChildren) Insert(name string, child *vfs.Dentry) error {
// Precondition: caller must hold o.mu for writing.
func (o *OrderedChildren) removeLocked(name string) {
if s, ok := o.set[name]; ok {
+ if s.static {
+ panic(fmt.Sprintf("removeLocked called on a static inode: %v", s.inode))
+ }
delete(o.set, name)
o.order.Remove(s)
}
}
// Precondition: caller must hold o.mu for writing.
-func (o *OrderedChildren) replaceChildLocked(name string, new *vfs.Dentry) *vfs.Dentry {
+func (o *OrderedChildren) replaceChildLocked(ctx context.Context, name string, newI Inode) {
if s, ok := o.set[name]; ok {
+ if s.static {
+ panic(fmt.Sprintf("replacing a static inode: %v", s.inode))
+ }
+
// Existing slot with the given name; simply replace the inode.
- var old *vfs.Dentry
- old, s.Dentry = s.Dentry, new
- return old
+ s.inode = newI
+ return
}
// No existing slot with given name, create and hash new slot.
s := &slot{
- Name: name,
- Dentry: new,
+ name: name,
+ inode: newI,
+ static: false,
}
o.order.PushBack(s)
o.set[name] = s
- return nil
}
// Precondition: caller must hold o.mu for reading or writing.
-func (o *OrderedChildren) checkExistingLocked(name string, child *vfs.Dentry) error {
+func (o *OrderedChildren) checkExistingLocked(name string, child Inode) error {
s, ok := o.set[name]
if !ok {
return syserror.ENOENT
}
- if s.Dentry != child {
- panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! OrderedChild: %+v, vfs: %+v", s.Dentry, child))
+ if s.inode != child {
+ panic(fmt.Sprintf("Inode doesn't match what kernfs thinks! OrderedChild: %+v, kernfs: %+v", s.inode, child))
}
return nil
}
// Unlink implements Inode.Unlink.
-func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.Dentry) error {
+func (o *OrderedChildren) Unlink(ctx context.Context, name string, child Inode) error {
if !o.writable {
return syserror.EPERM
}
@@ -469,17 +555,20 @@ func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.De
if err := o.checkExistingLocked(name, child); err != nil {
return err
}
+
+ // TODO(gvisor.dev/issue/3027): Check sticky bit before removing.
o.removeLocked(name)
return nil
}
-// Rmdir implements Inode.Rmdir.
-func (o *OrderedChildren) RmDir(ctx context.Context, name string, child *vfs.Dentry) error {
+// RmDir implements Inode.RmDir.
+func (o *OrderedChildren) RmDir(ctx context.Context, name string, child Inode) error {
// We're not responsible for checking that child is a directory, that it's
// empty, or updating any link counts; so this is the same as unlink.
return o.Unlink(ctx, name, child)
}
+// +stateify savable
type renameAcrossDifferentImplementationsError struct{}
func (renameAcrossDifferentImplementationsError) Error() string {
@@ -495,13 +584,13 @@ func (renameAcrossDifferentImplementationsError) Error() string {
// that will support Rename.
//
// Postcondition: reference on any replaced dentry transferred to caller.
-func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (*vfs.Dentry, error) {
- dst, ok := dstDir.Impl().(*Dentry).inode.(interface{}).(*OrderedChildren)
+func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error {
+ dst, ok := dstDir.(interface{}).(*OrderedChildren)
if !ok {
- return nil, renameAcrossDifferentImplementationsError{}
+ return renameAcrossDifferentImplementationsError{}
}
if !o.writable || !dst.writable {
- return nil, syserror.EPERM
+ return syserror.EPERM
}
// Note: There's a potential deadlock below if concurrent calls to Rename
// refer to the same src and dst directories in reverse. We avoid any
@@ -514,10 +603,12 @@ func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, c
defer dst.mu.Unlock()
}
if err := o.checkExistingLocked(oldname, child); err != nil {
- return nil, err
+ return err
}
- replaced := dst.replaceChildLocked(newname, child)
- return replaced, nil
+
+ // TODO(gvisor.dev/issue/3027): Check sticky bit before removing.
+ dst.replaceChildLocked(ctx, newname, child)
+ return nil
}
// nthLocked returns an iterator to the nth child tracked by this object. The
@@ -536,12 +627,14 @@ func (o *OrderedChildren) nthLocked(i int64) *slot {
}
// InodeSymlink partially implements Inode interface for symlinks.
+//
+// +stateify savable
type InodeSymlink struct {
InodeNotDirectory
}
// Open implements Inode.Open.
-func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
return nil, syserror.ELOOP
}
@@ -550,41 +643,46 @@ func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D
//
// +stateify savable
type StaticDirectory struct {
- InodeNotSymlink
- InodeDirectoryNoNewChildren
+ InodeAlwaysValid
InodeAttrs
- InodeNoDynamicLookup
+ InodeDirectoryNoNewChildren
+ InodeNoStatFS
+ InodeNotSymlink
+ InodeTemporary
OrderedChildren
+ StaticDirectoryRefs
+
+ locks vfs.FileLocks
+ fdOpts GenericDirectoryFDOptions
}
var _ Inode = (*StaticDirectory)(nil)
// NewStaticDir creates a new static directory and returns its inode.
-func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry {
+func NewStaticDir(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]Inode, fdOpts GenericDirectoryFDOptions) Inode {
inode := &StaticDirectory{}
- inode.Init(creds, devMajor, devMinor, ino, perm)
-
- dentry := &Dentry{}
- dentry.Init(inode)
+ inode.Init(ctx, creds, devMajor, devMinor, ino, perm, fdOpts)
+ inode.EnableLeakCheck()
inode.OrderedChildren.Init(OrderedChildrenOptions{})
- links := inode.OrderedChildren.Populate(dentry, children)
+ links := inode.OrderedChildren.Populate(children)
inode.IncLinks(links)
- return dentry
+ return inode
}

// Init initializes StaticDirectory.
-func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
+func (s *StaticDirectory) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, fdOpts GenericDirectoryFDOptions) {
if perm&^linux.PermissionsMask != 0 {
panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
}
- s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeDirectory|perm)
+ s.fdOpts = fdOpts
+ s.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeDirectory|perm)
}
-// Open implements kernfs.Inode.
-func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &opts)
+// Open implements Inode.Open.
+func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd, err := NewGenericDirectoryFD(rp.Mount(), d, &s.OrderedChildren, &s.locks, &opts, s.fdOpts)
if err != nil {
return nil, err
}
@@ -596,10 +694,38 @@ func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credenti
return syserror.EPERM
}
-// AlwaysValid partially implements kernfs.inodeDynamicLookup.
-type AlwaysValid struct{}
+// DecRef implements Inode.DecRef.
+func (s *StaticDirectory) DecRef(ctx context.Context) {
+ s.StaticDirectoryRefs.DecRef(func() { s.Destroy(ctx) })
+}
+
+// InodeAlwaysValid partially implements Inode.
+//
+// +stateify savable
+type InodeAlwaysValid struct{}
-// Valid implements kernfs.inodeDynamicLookup.
-func (*AlwaysValid) Valid(context.Context) bool {
+// Valid implements Inode.Valid.
+func (*InodeAlwaysValid) Valid(context.Context) bool {
return true
}
+
+// InodeTemporary partially implements Inode.
+//
+// +stateify savable
+type InodeTemporary struct{}
+
+// Keep implements Inode.Keep.
+func (*InodeTemporary) Keep() bool {
+ return false
+}
+
+// InodeNoStatFS partially implements the Inode interface, where the client
+// filesystem doesn't support statfs(2).
+//
+// +stateify savable
+type InodeNoStatFS struct{}
+
+// StatFS implements Inode.StatFS.
+func (*InodeNoStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
+ return linux.Statfs{}, syserror.ENOSYS
+}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index bbee8ccda..5c5e09ac5 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -29,12 +29,16 @@
//
// Reference Model:
//
-// Kernfs dentries represents named pointers to inodes. Dentries and inode have
+// Kernfs dentries represent named pointers to inodes. Kernfs is solely
+// responsible for maintaining and modifying its dentry tree; inode
+// implementations cannot access the tree. Dentries and inodes have
// independent lifetimes and reference counts. A child dentry unconditionally
// holds a reference on its parent directory's dentry. A dentry also holds a
-// reference on the inode it points to. Multiple dentries can point to the same
-// inode (for example, in the case of hardlinks). File descriptors hold a
-// reference to the dentry they're opened on.
+// reference on the inode it points to (although that might not be the only
+// reference on the inode). Due to this, inodes can outlive the dentries that
+// point to them. Multiple dentries can point to the same inode (for example,
+// in the case of hardlinks). File descriptors hold a reference to the dentry
+// they're opened on.
//
// Dentries are guaranteed to exist while holding Filesystem.mu for
// reading. Dropping dentries requires holding Filesystem.mu for writing. To
@@ -47,8 +51,8 @@
// kernfs.Dentry.dirMu
// vfs.VirtualFilesystem.mountMu
// vfs.Dentry.mu
-// kernfs.Filesystem.droppedDentriesMu
// (inode implementation locks, if any)
+// kernfs.Filesystem.droppedDentriesMu
package kernfs
import (
@@ -57,7 +61,6 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
@@ -66,15 +69,17 @@ import (
// Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory
// filesystem. Concrete implementations are expected to embed this in their own
// Filesystem type.
+//
+// +stateify savable
type Filesystem struct {
vfsfs vfs.Filesystem
- droppedDentriesMu sync.Mutex
+ droppedDentriesMu sync.Mutex `state:"nosave"`
// droppedDentries is a list of dentries waiting to be DecRef()ed. This is
// used to defer dentry destruction until mu can be acquired for
// writing. Protected by droppedDentriesMu.
- droppedDentries []*vfs.Dentry
+ droppedDentries []*Dentry
// mu synchronizes the lifetime of Dentries on this filesystem. Holding it
// for reading guarantees continued existence of any resolved dentries, but
@@ -93,22 +98,32 @@ type Filesystem struct {
// example:
//
// fs.mu.RLock()
- // fs.mu.processDeferredDecRefs()
+ // defer fs.processDeferredDecRefs()
// defer fs.mu.RUnlock()
// ...
// fs.deferDecRef(dentry)
- mu sync.RWMutex
+ mu sync.RWMutex `state:"nosave"`
// nextInoMinusOne is used to allocate inode numbers on this
// filesystem. Must be accessed by atomic operations.
nextInoMinusOne uint64
+
+ // cachedDentries contains all dentries with 0 references. (Due to race
+ // conditions, it may also contain dentries with non-zero references.)
+ // cachedDentriesLen is the number of dentries in cachedDentries. These
+ // fields are protected by mu.
+ cachedDentries dentryList
+ cachedDentriesLen uint64
+
+ // MaxCachedDentries is the maximum size of cachedDentries. If not set,
+ // defaults to 0 and kernfs does not cache any dentries. This is immutable.
+ MaxCachedDentries uint64
}
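MaxCachedDentries is consumed by the caching logic added to Dentry.DecRef below; a client would set it exactly once, before handing the filesystem to VFS. A hedged sketch (the constructor and its limit parameter are hypothetical):

package myfs // hypothetical kernfs client

import "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"

type filesystem struct {
	kernfs.Filesystem
}

// newFilesystem configures the dentry cache up front; once the filesystem
// is in use, MaxCachedDentries must not change.
func newFilesystem(maxCached uint64) *filesystem {
	fs := &filesystem{}
	// 0 (the default) disables dentry caching entirely.
	fs.MaxCachedDentries = maxCached
	return fs
}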
// deferDecRef defers dropping a dentry ref until the next call to
// processDeferredDecRefs. See comment on Filesystem.mu.
-//
-// Precondition: d must not already be pending destruction.
-func (fs *Filesystem) deferDecRef(d *vfs.Dentry) {
+// This may be called while Filesystem.mu or Dentry.dirMu is locked.
+func (fs *Filesystem) deferDecRef(d *Dentry) {
fs.droppedDentriesMu.Lock()
fs.droppedDentries = append(fs.droppedDentries, d)
fs.droppedDentriesMu.Unlock()
@@ -116,17 +131,14 @@ func (fs *Filesystem) deferDecRef(d *vfs.Dentry) {
// processDeferredDecRefs calls Dentry.DecRef on all dentries in the
// droppedDentries list. See comment on Filesystem.mu.
-func (fs *Filesystem) processDeferredDecRefs() {
- fs.mu.Lock()
- fs.processDeferredDecRefsLocked()
- fs.mu.Unlock()
-}
-
-// Precondition: fs.mu must be held for writing.
-func (fs *Filesystem) processDeferredDecRefsLocked() {
+//
+// Precondition: Filesystem.mu or Dentry.dirMu must NOT be locked.
+func (fs *Filesystem) processDeferredDecRefs(ctx context.Context) {
fs.droppedDentriesMu.Lock()
for _, d := range fs.droppedDentries {
- d.DecRef()
+ // Defer the DecRef call so that we are not holding droppedDentriesMu
+ // when DecRef is called.
+ defer d.DecRef(ctx)
}
fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse.
fs.droppedDentriesMu.Unlock()
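Put together with the comment on Filesystem.mu above, the intended calling pattern inside package kernfs looks roughly like the following sketch (the operation body is hypothetical):

// someOperation is a hypothetical kernfs VFS method.
func (fs *Filesystem) someOperation(ctx context.Context) error {
	fs.mu.RLock()
	// Runs last: the deferred DecRefs may re-lock fs.mu for writing, so
	// they must happen after the read lock is released.
	defer fs.processDeferredDecRefs(ctx)
	defer fs.mu.RUnlock()

	// ... path resolution; any dentry invalidated along the way is queued
	// with fs.deferDecRef(d) rather than DecRef'd inline.
	return nil
}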
@@ -155,15 +167,24 @@ const (
//
// A kernfs dentry is similar to a dentry in a traditional filesystem: it's a
// named reference to an inode. A dentry generally lives as long as it's part of
-// a mounted filesystem tree. Kernfs doesn't cache dentries once all references
-// to them are removed. Dentries hold a single reference to the inode they point
+// a mounted filesystem tree. Kernfs caches or destroys dentries once all
+// references to them are dropped. Dentries hold a single reference to the
+// inode they point
// to, and child dentries hold a reference on their parent.
//
// Must be initialized by Init prior to first use.
+//
+// +stateify savable
type Dentry struct {
vfsd vfs.Dentry
- refs.AtomicRefCount
+ // refs is the reference count. When refs reaches 0, the dentry may be
+ // added to the cache or destroyed. If refs == -1, the dentry has already
+ // been destroyed. refs are allowed to go to 0 and increase again. refs is
+ // accessed using atomic memory operations.
+ refs int64
+
+ // fs is the owning filesystem. fs is immutable.
+ fs *Filesystem
// flags caches useful information about the dentry from the inode. See the
// dflags* consts above. Must be accessed by atomic ops.
@@ -172,21 +193,177 @@ type Dentry struct {
parent *Dentry
name string
+ // If cached is true, dentryEntry links dentry into
+ // Filesystem.cachedDentries. cached and dentryEntry are protected by
+ // Filesystem.mu.
+ cached bool
+ dentryEntry
+
// dirMu protects children and the names of child Dentries.
- dirMu sync.Mutex
+ //
+ // Note that holding fs.mu for writing is not sufficient;
+ // revalidateChildLocked(), which is a very hot path, may modify children with
+ // fs.mu acquired for reading only.
+ dirMu sync.Mutex `state:"nosave"`
children map[string]*Dentry
inode Inode
}
+// IncRef implements vfs.DentryImpl.IncRef.
+func (d *Dentry) IncRef() {
+ // d.refs may be 0 if d.fs.mu is locked, which serializes against
+ // d.cacheLocked().
+ atomic.AddInt64(&d.refs, 1)
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef.
+func (d *Dentry) TryIncRef() bool {
+ for {
+ refs := atomic.LoadInt64(&d.refs)
+ if refs <= 0 {
+ return false
+ }
+ if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) {
+ return true
+ }
+ }
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+func (d *Dentry) DecRef(ctx context.Context) {
+ if refs := atomic.AddInt64(&d.refs, -1); refs == 0 {
+ d.fs.mu.Lock()
+ d.cacheLocked(ctx)
+ d.fs.mu.Unlock()
+ } else if refs < 0 {
+ panic("kernfs.Dentry.DecRef() called without holding a reference")
+ }
+}
+
+// cacheLocked should be called after d's reference count becomes 0. The ref
+// count check may happen before acquiring d.fs.mu so there might be a race
+// condition where the ref count is increased again by the time the caller
+// acquires d.fs.mu. This race is handled.
+// Only reachable dentries are added to the cache. However, a dentry might
+// become unreachable *while* it is in the cache due to invalidation.
+//
+// Preconditions: d.fs.mu must be locked for writing.
+func (d *Dentry) cacheLocked(ctx context.Context) {
+ // Dentries with a non-zero reference count must be retained. (The only way
+ // to obtain a reference on a dentry with zero references is via path
+ // resolution, which requires d.fs.mu, so if d.refs is zero then it will
+ // remain zero while we hold d.fs.mu for writing.)
+ refs := atomic.LoadInt64(&d.refs)
+ if refs == -1 {
+ // Dentry has already been destroyed.
+ panic(fmt.Sprintf("cacheLocked called on a dentry which has already been destroyed: %v", d))
+ }
+ if refs > 0 {
+ if d.cached {
+ d.fs.cachedDentries.Remove(d)
+ d.fs.cachedDentriesLen--
+ d.cached = false
+ }
+ return
+ }
+ // If the dentry is deleted and invalidated or has no parent, then it is no
+ // longer reachable by path resolution and should be dropped immediately
+ // because it has zero references.
+ // Note that a dentry may not always have a parent; for example magic links
+ // as described in Inode.Getlink.
+ if isDead := d.VFSDentry().IsDead(); isDead || d.parent == nil {
+ if !isDead {
+ d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, d.VFSDentry())
+ }
+ if d.cached {
+ d.fs.cachedDentries.Remove(d)
+ d.fs.cachedDentriesLen--
+ d.cached = false
+ }
+ d.destroyLocked(ctx)
+ return
+ }
+ // If d is already cached, just move it to the front of the LRU.
+ if d.cached {
+ d.fs.cachedDentries.Remove(d)
+ d.fs.cachedDentries.PushFront(d)
+ return
+ }
+ // Cache the dentry, then evict the least recently used cached dentry if
+ // the cache becomes over-full.
+ d.fs.cachedDentries.PushFront(d)
+ d.fs.cachedDentriesLen++
+ d.cached = true
+ if d.fs.cachedDentriesLen <= d.fs.MaxCachedDentries {
+ return
+ }
+ // Evict the least recently used dentry because cache size is greater than
+ // max cache size (configured on mount).
+ victim := d.fs.cachedDentries.Back()
+ d.fs.cachedDentries.Remove(victim)
+ d.fs.cachedDentriesLen--
+ victim.cached = false
+ // victim.refs may have become non-zero from an earlier path resolution
+ // after it was inserted into fs.cachedDentries.
+ if atomic.LoadInt64(&victim.refs) == 0 {
+ if !victim.vfsd.IsDead() {
+ victim.parent.dirMu.Lock()
+ // Note that victim can't be a mount point (in any mount
+ // namespace), since VFS holds references on mount points.
+ d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, victim.VFSDentry())
+ delete(victim.parent.children, victim.name)
+ victim.parent.dirMu.Unlock()
+ }
+ victim.destroyLocked(ctx)
+ }
+ // Whether or not victim was destroyed, we brought fs.cachedDentriesLen
+ // back down to fs.MaxCachedDentries, so we don't loop.
+}
+
+// destroyLocked destroys the dentry.
+//
+// Preconditions:
+// * d.fs.mu must be locked for writing.
+// * d.refs == 0.
+// * d should have been removed from d.parent.children, i.e. d is not reachable
+// by path traversal.
+// * d.vfsd.IsDead() is true.
+func (d *Dentry) destroyLocked(ctx context.Context) {
+ switch atomic.LoadInt64(&d.refs) {
+ case 0:
+ // Mark the dentry destroyed.
+ atomic.StoreInt64(&d.refs, -1)
+ case -1:
+ panic("dentry.destroyLocked() called on already destroyed dentry")
+ default:
+ panic("dentry.destroyLocked() called with references on the dentry")
+ }
+
+ d.inode.DecRef(ctx) // IncRef from Init.
+ d.inode = nil
+
+ // Drop the reference held by d on its parent without recursively locking
+ // d.fs.mu.
+ if d.parent != nil {
+ if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 {
+ d.parent.cacheLocked(ctx)
+ } else if refs < 0 {
+ panic("kernfs.Dentry.DecRef() called without holding a reference")
+ }
+ }
+}
+
// Init initializes this dentry.
//
// Precondition: Caller must hold a reference on inode.
//
// Postcondition: Caller's reference on inode is transferred to the dentry.
-func (d *Dentry) Init(inode Inode) {
+func (d *Dentry) Init(fs *Filesystem, inode Inode) {
d.vfsd.Init(d)
+ d.fs = fs
d.inode = inode
+ atomic.StoreInt64(&d.refs, 1)
ftype := inode.Mode().FileType()
if ftype == linux.ModeDirectory {
d.flags |= dflagsIsDir
@@ -211,51 +388,44 @@ func (d *Dentry) isSymlink() bool {
return atomic.LoadUint32(&d.flags)&dflagsIsSymlink != 0
}
-// DecRef implements vfs.DentryImpl.DecRef.
-func (d *Dentry) DecRef() {
- d.AtomicRefCount.DecRefWithDestructor(d.destroy)
-}
-
-// Precondition: Dentry must be removed from VFS' dentry cache.
-func (d *Dentry) destroy() {
- d.inode.DecRef() // IncRef from Init.
- d.inode = nil
- if d.parent != nil {
- d.parent.DecRef() // IncRef from Dentry.InsertChild.
- }
-}
-
// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
//
-// TODO(gvisor.dev/issue/1479): Implement inotify.
-func (d *Dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {}
+// Although Linux technically supports inotify on pseudo filesystems (inotify
+// is implemented at the vfs layer), it is not particularly useful. It is left
+// unimplemented until someone actually needs it.
+func (d *Dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {}
// Watches implements vfs.DentryImpl.Watches.
-//
-// TODO(gvisor.dev/issue/1479): Implement inotify.
func (d *Dentry) Watches() *vfs.Watches {
return nil
}
-// InsertChild inserts child into the vfs dentry cache with the given name under
-// this dentry. This does not update the directory inode, so calling this on
-// its own isn't sufficient to insert a child into a directory. InsertChild
-// updates the link count on d if required.
+// OnZeroWatches implements vfs.Dentry.OnZeroWatches.
+func (d *Dentry) OnZeroWatches(context.Context) {}
+
+// insertChild inserts child into the vfs dentry cache with the given name under
+// this dentry. This does not update the directory inode, so calling this on its
+// own isn't sufficient to insert a child into a directory.
//
-// Precondition: d must represent a directory inode.
-func (d *Dentry) InsertChild(name string, child *Dentry) {
+// Preconditions:
+// * d must represent a directory inode.
+// * d.fs.mu must be locked for at least reading.
+func (d *Dentry) insertChild(name string, child *Dentry) {
d.dirMu.Lock()
d.insertChildLocked(name, child)
d.dirMu.Unlock()
}
-// insertChildLocked is equivalent to InsertChild, with additional
+// insertChildLocked is equivalent to insertChild, with additional
// preconditions.
//
-// Precondition: d.dirMu must be locked.
+// Preconditions:
+// * d must represent a directory inode.
+// * d.dirMu must be locked.
+// * d.fs.mu must be locked for at least reading.
func (d *Dentry) insertChildLocked(name string, child *Dentry) {
if !d.isDir() {
- panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d))
+ panic(fmt.Sprintf("insertChildLocked called on non-directory Dentry: %+v.", d))
}
d.IncRef() // DecRef in child's Dentry.destroy.
child.parent = d
@@ -286,7 +456,6 @@ func (d *Dentry) Inode() Inode {
//
// - Checking that dentries passed to methods are of the appropriate file type.
// - Checking permissions.
-// - Updating link and reference counts.
//
// Specific responsibilities of implementations are documented below.
type Inode interface {
@@ -296,7 +465,8 @@ type Inode interface {
inodeRefs
// Methods related to node metadata. A generic implementation is provided by
- // InodeAttrs.
+ // InodeAttrs. Note that a concrete filesystem using kernfs is responsible for
+ // managing link counts.
inodeMetadata
// Method for inodes that represent symlink. InodeNotSymlink provides a
@@ -307,28 +477,32 @@ type Inode interface {
// a blanket implementation for all non-directory inodes.
inodeDirectory
- // Method for inodes that represent dynamic directories and their
- // children. InodeNoDynamicLookup provides a blanket implementation for all
- // non-dynamic-directory inodes.
- inodeDynamicLookup
-
// Open creates a file description for the filesystem object represented by
// this inode. The returned file description should hold a reference on the
- // inode for its lifetime.
+ // dentry for its lifetime.
//
// Precondition: rp.Done(). d must be the kernfs Dentry containing
// the inode on which Open() is being called.
- Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)
+ Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)
+
+ // StatFS returns filesystem statistics for the client filesystem. This
+ // corresponds to vfs.FilesystemImpl.StatFSAt. If the client filesystem
+ // doesn't support statfs(2), this should return ENOSYS.
+ StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error)
+
+ // Keep indicates whether the dentry created after Inode.Lookup should be
+ // kept in the kernfs dentry tree.
+ Keep() bool
+
+ // Valid should return true if this inode is still valid, or needs to
+ // be resolved again by a call to Lookup.
+ Valid(ctx context.Context) bool
}
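Keep and Valid together control dentry lifetime for dynamic filesystems. As a hedged sketch, a procfs-like per-task directory that must never be cached and that expires with its task might implement them as follows (taskDirInode and taskExited are hypothetical stand-ins):

package myfs // hypothetical kernfs client

import "gvisor.dev/gvisor/pkg/context"

// taskDirInode is a hypothetical per-task directory inode, similar in
// spirit to /proc/[pid].
type taskDirInode struct {
	// ... kernfs mixins elided
	taskExited func() bool
}

// Keep returns false so the dentry produced by Lookup is dropped at the
// end of the current filesystem operation instead of being cached.
func (i *taskDirInode) Keep() bool {
	return false
}

// Valid returns false once the task is gone, forcing the next walk to call
// Lookup again (which would then fail with ENOENT).
func (i *taskDirInode) Valid(ctx context.Context) bool {
	return !i.taskExited()
}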
type inodeRefs interface {
IncRef()
- DecRef()
+ DecRef(ctx context.Context)
TryIncRef() bool
- // Destroy is called when the inode reaches zero references. Destroy release
- // all resources (references) on objects referenced by the inode, including
- // any child dentries.
- Destroy()
}
type inodeMetadata interface {
@@ -343,7 +517,7 @@ type inodeMetadata interface {
// Stat returns the metadata for this inode. This corresponds to
// vfs.FilesystemImpl.StatAt.
- Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error)
+ Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error)
// SetStat updates the metadata for this inode. This corresponds to
// vfs.FilesystemImpl.SetStatAt. Implementations are responsible for checking
@@ -355,8 +529,8 @@ type inodeMetadata interface {
// Precondition: All methods in this interface may only be called on directory
// inodes.
type inodeDirectory interface {
- // The New{File,Dir,Node,Symlink} methods below should return a new inode
- // hashed into this inode.
+ // The New{File,Dir,Node,Link,Symlink} methods below should return a new inode
+ // that will be hashed into the dentry tree.
//
// These inode constructors are inode-level operations rather than
// filesystem-level operations to allow client filesystems to mix different
@@ -367,75 +541,69 @@ type inodeDirectory interface {
HasChildren() bool
// NewFile creates a new regular file inode.
- NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error)
+ NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (Inode, error)
// NewDir creates a new directory inode.
- NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error)
+ NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error)
// NewLink creates a new hardlink to a specified inode in this
// directory. Implementations should create a new kernfs Dentry pointing to
// target, and update target's link count.
- NewLink(ctx context.Context, name string, target Inode) (*vfs.Dentry, error)
+ NewLink(ctx context.Context, name string, target Inode) (Inode, error)
// NewSymlink creates a new symbolic link inode.
- NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error)
+ NewSymlink(ctx context.Context, name, target string) (Inode, error)
// NewNode creates a new filesystem node for a mknod syscall.
- NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error)
+ NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (Inode, error)
// Unlink removes a child dentry from this directory inode.
- Unlink(ctx context.Context, name string, child *vfs.Dentry) error
+ Unlink(ctx context.Context, name string, child Inode) error
// RmDir removes an empty child directory from this directory
// inode. Implementations must update the parent directory's link count,
// if required. Implementations are not responsible for checking that child
// is a directory, or for checking that the directory is empty.
- RmDir(ctx context.Context, name string, child *vfs.Dentry) error
+ RmDir(ctx context.Context, name string, child Inode) error
// Rename is called on the source directory containing an inode being
// renamed. child should point to the resolved child in the source
- // directory. If Rename replaces a dentry in the destination directory, it
- // should return the replaced dentry or nil otherwise.
+ // directory.
//
// Precondition: Caller must serialize concurrent calls to Rename.
- Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (replaced *vfs.Dentry, err error)
-}
+ Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error
-type inodeDynamicLookup interface {
- // Lookup should return an appropriate dentry if name should resolve to a
- // child of this dynamic directory inode. This gives the directory an
- // opportunity on every lookup to resolve additional entries that aren't
- // hashed into the directory. This is only called when the inode is a
- // directory. If the inode is not a directory, or if the directory only
- // contains a static set of children, the implementer can unconditionally
- // return an appropriate error (ENOTDIR and ENOENT respectively).
+ // Lookup should return an appropriate inode if name should resolve to a
+ // child of this directory inode. This gives the directory an opportunity
+ // on every lookup to resolve additional entries. This is only called when
+ // the inode is a directory.
//
- // The child returned by Lookup will be hashed into the VFS dentry tree. Its
- // lifetime can be controlled by the filesystem implementation with an
- // appropriate implementation of Valid.
+ // The child returned by Lookup will be hashed into the VFS dentry tree,
+ // at least for the duration of the current FS operation.
//
- // Lookup returns the child with an extra reference and the caller owns this
- // reference.
- Lookup(ctx context.Context, name string) (*vfs.Dentry, error)
-
- // Valid should return true if this inode is still valid, or needs to
- // be resolved again by a call to Lookup.
- Valid(ctx context.Context) bool
+ // Lookup must return the child with an extra reference whose ownership is
+ // transferred to the dentry that is created to point to that inode. If
+ // Inode.Keep returns false, that new dentry will be dropped at the end of
+ // the current filesystem operation (before returning back to the VFS
+ // layer) if no other ref is picked on that dentry. If Inode.Keep returns
+ // true, then the dentry will be cached into the dentry tree until it is
+ // Unlink'd or RmDir'd.
+ Lookup(ctx context.Context, name string) (Inode, error)
// IterDirents is used to iterate over dynamically created entries. It invokes
- // cb on each entry in the directory represented by the FileDescription.
+ // cb on each entry in the directory represented by the Inode.
// 'offset' is the offset for the entire IterDirents call, which may include
- // results from the caller. 'relOffset' is the offset inside the entries
- // returned by this IterDirents invocation. In other words,
- // 'offset+relOffset+1' is the value that should be set in vfs.Dirent.NextOff,
- // while 'relOffset' is the place where iteration should start from.
- IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error)
+ // results from the caller (e.g. "." and ".."). 'relOffset' is the offset
+ // inside the entries returned by this IterDirents invocation. In other words,
+ // 'offset' should be used to calculate each vfs.Dirent.NextOff as well as
+ // the return value, while 'relOffset' is the place to start iteration.
+ IterDirents(ctx context.Context, mnt *vfs.Mount, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error)
}
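The offset bookkeeping in IterDirents is the subtle part: relOffset indexes into this inode's own entries, while offset numbers the whole stream the caller has already partially emitted ("." and ".." and any static entries). A hedged sketch of a conforming implementation over a hypothetical name list:

package myfs // hypothetical kernfs client

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// dynDirInode is a hypothetical directory whose entries are generated
// dynamically; names and inos stand in for real state.
type dynDirInode struct {
	names []string
	inos  map[string]uint64
}

func (i *dynDirInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
	if relOffset >= int64(len(i.names)) {
		return offset, nil
	}
	for _, name := range i.names[relOffset:] {
		dirent := vfs.Dirent{
			Name:    name,
			Type:    linux.DT_REG,
			Ino:     i.inos[name],
			NextOff: offset + 1, // where iteration resumes after this entry
		}
		if err := cb.Handle(dirent); err != nil {
			return offset, err
		}
		offset++
	}
	return offset, nil
}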
type inodeSymlink interface {
// Readlink returns the target of a symbolic link. If an inode is not a
// symlink, the implementation should return EINVAL.
- Readlink(ctx context.Context) (string, error)
+ Readlink(ctx context.Context, mnt *vfs.Mount) (string, error)
// Getlink returns the target of a symbolic link, as used by path
// resolution:
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index 412cf6ac9..2418eec44 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -36,7 +36,7 @@ const staticFileContent = "This is sample content for a static test file."
// RootDentryFn is a generator function for creating the root inode of a test
// filesystem. See newTestSystem.
-type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry
+type RootDentryFn func(context.Context, *auth.Credentials, *filesystem) kernfs.Inode
// newTestSystem sets up a minimal environment for running a test, including an
// instance of a test filesystem. Tests can control the contents of the
@@ -46,13 +46,13 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System {
ctx := contexttest.Context(t)
creds := auth.CredentialsFromContext(ctx)
v := &vfs.VirtualFilesystem{}
- if err := v.Init(); err != nil {
+ if err := v.Init(ctx); err != nil {
t.Fatalf("VFS init: %v", err)
}
v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
- mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.GetFilesystemOptions{})
+ mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.MountOptions{})
if err != nil {
t.Fatalf("Failed to create testfs root mount: %v", err)
}
@@ -72,14 +72,11 @@ type file struct {
content string
}
-func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry {
+func (fs *filesystem) newFile(ctx context.Context, creds *auth.Credentials, content string) kernfs.Inode {
f := &file{}
f.content = content
- f.DynamicBytesFile.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), f, 0777)
-
- d := &kernfs.Dentry{}
- d.Init(f)
- return d
+ f.DynamicBytesFile.Init(ctx, creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), f, 0777)
+ return f
}
func (f *file) Generate(ctx context.Context, buf *bytes.Buffer) error {
@@ -96,96 +93,112 @@ func (*attrs) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.S
}
type readonlyDir struct {
+ readonlyDirRefs
attrs
- kernfs.InodeNotSymlink
- kernfs.InodeNoDynamicLookup
+ kernfs.InodeAlwaysValid
kernfs.InodeDirectoryNoNewChildren
-
+ kernfs.InodeNoStatFS
+ kernfs.InodeNotSymlink
+ kernfs.InodeTemporary
kernfs.OrderedChildren
- dentry kernfs.Dentry
+
+ locks vfs.FileLocks
}
-func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+func (fs *filesystem) newReadonlyDir(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode {
dir := &readonlyDir{}
- dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
+ dir.attrs.Init(ctx, creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
- dir.dentry.Init(dir)
-
- dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents))
-
- return &dir.dentry
+ dir.EnableLeakCheck()
+ dir.IncLinks(dir.OrderedChildren.Populate(contents))
+ return dir
}
-func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
+ SeekEnd: kernfs.SeekEndStaticEntries,
+ })
if err != nil {
return nil, err
}
return fd.VFSFileDescription(), nil
}
+func (d *readonlyDir) DecRef(ctx context.Context) {
+ d.readonlyDirRefs.DecRef(func() { d.Destroy(ctx) })
+}
+
type dir struct {
+ dirRefs
attrs
+ kernfs.InodeAlwaysValid
kernfs.InodeNotSymlink
- kernfs.InodeNoDynamicLookup
-
- fs *filesystem
- dentry kernfs.Dentry
+ kernfs.InodeNoStatFS
+ kernfs.InodeTemporary
kernfs.OrderedChildren
+
+ locks vfs.FileLocks
+
+ fs *filesystem
}
-func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+func (fs *filesystem) newDir(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode {
dir := &dir{}
dir.fs = fs
- dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
+ dir.attrs.Init(ctx, creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true})
- dir.dentry.Init(dir)
-
- dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents))
+ dir.EnableLeakCheck()
- return &dir.dentry
+ dir.IncLinks(dir.OrderedChildren.Populate(contents))
+ return dir
}
-func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
+ SeekEnd: kernfs.SeekEndStaticEntries,
+ })
if err != nil {
return nil, err
}
return fd.VFSFileDescription(), nil
}
-func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) {
+func (d *dir) DecRef(ctx context.Context) {
+ d.dirRefs.DecRef(func() { d.Destroy(ctx) })
+}
+
+func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) {
creds := auth.CredentialsFromContext(ctx)
- dir := d.fs.newDir(creds, opts.Mode, nil)
- dirVFSD := dir.VFSDentry()
- if err := d.OrderedChildren.Insert(name, dirVFSD); err != nil {
- dir.DecRef()
+ dir := d.fs.newDir(ctx, creds, opts.Mode, nil)
+ if err := d.OrderedChildren.Insert(name, dir); err != nil {
+ dir.DecRef(ctx)
return nil, err
}
+ d.TouchCMtime(ctx)
d.IncLinks(1)
- return dirVFSD, nil
+ return dir, nil
}
-func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) {
+func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (kernfs.Inode, error) {
creds := auth.CredentialsFromContext(ctx)
- f := d.fs.newFile(creds, "")
- fVFSD := f.VFSDentry()
- if err := d.OrderedChildren.Insert(name, fVFSD); err != nil {
- f.DecRef()
+ f := d.fs.newFile(ctx, creds, "")
+ if err := d.OrderedChildren.Insert(name, f); err != nil {
+ f.DecRef(ctx)
return nil, err
}
- return fVFSD, nil
+ d.TouchCMtime(ctx)
+ return f, nil
}
-func (*dir) NewLink(context.Context, string, kernfs.Inode) (*vfs.Dentry, error) {
+func (*dir) NewLink(context.Context, string, kernfs.Inode) (kernfs.Inode, error) {
return nil, syserror.EPERM
}
-func (*dir) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+func (*dir) NewSymlink(context.Context, string, string) (kernfs.Inode, error) {
return nil, syserror.EPERM
}
-func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (kernfs.Inode, error) {
return nil, syserror.EPERM
}
@@ -193,29 +206,33 @@ func (fsType) Name() string {
return "kernfs"
}
+func (fsType) Release(ctx context.Context) {}
+
func (fst fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
fs := &filesystem{}
fs.VFSFilesystem().Init(vfsObj, &fst, fs)
- root := fst.rootFn(creds, fs)
- return fs.VFSFilesystem(), root.VFSDentry(), nil
+ root := fst.rootFn(ctx, creds, fs)
+ var d kernfs.Dentry
+ d.Init(&fs.Filesystem, root)
+ return fs.VFSFilesystem(), d.VFSDentry(), nil
}
// -------------------- Remainder of the file are test cases --------------------
func TestBasic(t *testing.T) {
- sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
- return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
- "file1": fs.newFile(creds, staticFileContent),
+ sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+ return fs.newReadonlyDir(ctx, creds, 0755, map[string]kernfs.Inode{
+ "file1": fs.newFile(ctx, creds, staticFileContent),
})
})
defer sys.Destroy()
- sys.GetDentryOrDie(sys.PathOpAtRoot("file1")).DecRef()
+ sys.GetDentryOrDie(sys.PathOpAtRoot("file1")).DecRef(sys.Ctx)
}
func TestMkdirGetDentry(t *testing.T) {
- sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
- return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
- "dir1": fs.newDir(creds, 0755, nil),
+ sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+ return fs.newReadonlyDir(ctx, creds, 0755, map[string]kernfs.Inode{
+ "dir1": fs.newDir(ctx, creds, 0755, nil),
})
})
defer sys.Destroy()
@@ -224,13 +241,13 @@ func TestMkdirGetDentry(t *testing.T) {
if err := sys.VFS.MkdirAt(sys.Ctx, sys.Creds, pop, &vfs.MkdirOptions{Mode: 0755}); err != nil {
t.Fatalf("MkdirAt for PathOperation %+v failed: %v", pop, err)
}
- sys.GetDentryOrDie(pop).DecRef()
+ sys.GetDentryOrDie(pop).DecRef(sys.Ctx)
}
func TestReadStaticFile(t *testing.T) {
- sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
- return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
- "file1": fs.newFile(creds, staticFileContent),
+ sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+ return fs.newReadonlyDir(ctx, creds, 0755, map[string]kernfs.Inode{
+ "file1": fs.newFile(ctx, creds, staticFileContent),
})
})
defer sys.Destroy()
@@ -242,7 +259,7 @@ func TestReadStaticFile(t *testing.T) {
if err != nil {
t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
}
- defer fd.DecRef()
+ defer fd.DecRef(sys.Ctx)
content, err := sys.ReadToEnd(fd)
if err != nil {
@@ -254,9 +271,9 @@ func TestReadStaticFile(t *testing.T) {
}
func TestCreateNewFileInStaticDir(t *testing.T) {
- sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
- return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
- "dir1": fs.newDir(creds, 0755, nil),
+ sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+ return fs.newReadonlyDir(ctx, creds, 0755, map[string]kernfs.Inode{
+ "dir1": fs.newDir(ctx, creds, 0755, nil),
})
})
defer sys.Destroy()
@@ -269,7 +286,7 @@ func TestCreateNewFileInStaticDir(t *testing.T) {
}
// Close the file. The file should persist.
- fd.DecRef()
+ fd.DecRef(sys.Ctx)
fd, err = sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, &vfs.OpenOptions{
Flags: linux.O_RDONLY,
@@ -277,12 +294,12 @@ func TestCreateNewFileInStaticDir(t *testing.T) {
if err != nil {
t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err)
}
- fd.DecRef()
+ fd.DecRef(sys.Ctx)
}
func TestDirFDReadWrite(t *testing.T) {
- sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
- return fs.newReadonlyDir(creds, 0755, nil)
+ sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+ return fs.newReadonlyDir(ctx, creds, 0755, nil)
})
defer sys.Destroy()
@@ -293,7 +310,7 @@ func TestDirFDReadWrite(t *testing.T) {
if err != nil {
t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
}
- defer fd.DecRef()
+ defer fd.DecRef(sys.Ctx)
// Read/Write should fail for directory FDs.
if _, err := fd.Read(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); err != syserror.EISDIR {
@@ -305,14 +322,14 @@ func TestDirFDReadWrite(t *testing.T) {
}
func TestDirFDIterDirents(t *testing.T) {
- sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
- return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+ sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+ return fs.newReadonlyDir(ctx, creds, 0755, map[string]kernfs.Inode{
// Fill root with nodes backed by various inode implementations.
- "dir1": fs.newReadonlyDir(creds, 0755, nil),
- "dir2": fs.newDir(creds, 0755, map[string]*kernfs.Dentry{
- "dir3": fs.newDir(creds, 0755, nil),
+ "dir1": fs.newReadonlyDir(ctx, creds, 0755, nil),
+ "dir2": fs.newDir(ctx, creds, 0755, map[string]kernfs.Inode{
+ "dir3": fs.newDir(ctx, creds, 0755, nil),
}),
- "file1": fs.newFile(creds, staticFileContent),
+ "file1": fs.newFile(ctx, creds, staticFileContent),
})
})
defer sys.Destroy()
diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go
index 2ab3f53fd..a0736c0d6 100644
--- a/pkg/sentry/fsimpl/kernfs/symlink.go
+++ b/pkg/sentry/fsimpl/kernfs/symlink.go
@@ -24,10 +24,13 @@ import (
// StaticSymlink provides an Inode implementation for symlinks that point to
// an immutable target.
+//
+// +stateify savable
type StaticSymlink struct {
InodeAttrs
InodeNoopRefCount
InodeSymlink
+ InodeNoStatFS
target string
}
@@ -35,23 +38,20 @@ type StaticSymlink struct {
var _ Inode = (*StaticSymlink)(nil)
// NewStaticSymlink creates a new symlink file pointing to 'target'.
-func NewStaticSymlink(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) *Dentry {
+func NewStaticSymlink(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) Inode {
inode := &StaticSymlink{}
- inode.Init(creds, devMajor, devMinor, ino, target)
-
- d := &Dentry{}
- d.Init(inode)
- return d
+ inode.Init(ctx, creds, devMajor, devMinor, ino, target)
+ return inode
}
// Init initializes the instance.
-func (s *StaticSymlink) Init(creds *auth.Credentials, devMajor uint32, devMinor uint32, ino uint64, target string) {
+func (s *StaticSymlink) Init(ctx context.Context, creds *auth.Credentials, devMajor uint32, devMinor uint32, ino uint64, target string) {
s.target = target
- s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeSymlink|0777)
+ s.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeSymlink|0777)
}
-// Readlink implements Inode.
-func (s *StaticSymlink) Readlink(_ context.Context) (string, error) {
+// Readlink implements Inode.Readlink.
+func (s *StaticSymlink) Readlink(_ context.Context, _ *vfs.Mount) (string, error) {
return s.target, nil
}
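With this change, NewStaticSymlink returns an Inode instead of a *Dentry, so wrapping in a Dentry moves to the caller. A minimal sketch of the new calling pattern, assuming a Dentry.Init that takes the owning filesystem and the inode (fs, creds, and the device/inode numbers here are placeholders):

    inode := kernfs.NewStaticSymlink(ctx, creds, devMajor, devMinor, ino, "/proc/self")
    var d kernfs.Dentry
    d.Init(fs, inode) // assumed signature; the old constructor did this wrapping internally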
diff --git a/pkg/sentry/fsimpl/kernfs/synthetic_directory.go b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go
new file mode 100644
index 000000000..463d77d79
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go
@@ -0,0 +1,113 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// syntheticDirectory implements kernfs.Inode for a directory created by
+// MkdirAt(ForSyntheticMountpoint=true).
+//
+// +stateify savable
+type syntheticDirectory struct {
+ InodeAlwaysValid
+ InodeAttrs
+ InodeNoStatFS
+ InodeNotSymlink
+ OrderedChildren
+ syntheticDirectoryRefs
+
+ locks vfs.FileLocks
+}
+
+var _ Inode = (*syntheticDirectory)(nil)
+
+func newSyntheticDirectory(ctx context.Context, creds *auth.Credentials, perm linux.FileMode) Inode {
+ inode := &syntheticDirectory{}
+ inode.Init(ctx, creds, 0 /* devMajor */, 0 /* devMinor */, 0 /* ino */, perm)
+ return inode
+}
+
+func (dir *syntheticDirectory) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
+ if perm&^linux.PermissionsMask != 0 {
+ panic(fmt.Sprintf("perm contains non-permission bits: %#o", perm))
+ }
+ dir.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.S_IFDIR|perm)
+ dir.OrderedChildren.Init(OrderedChildrenOptions{
+ Writable: true,
+ })
+}
+
+// Open implements Inode.Open.
+func (dir *syntheticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd, err := NewGenericDirectoryFD(rp.Mount(), d, &dir.OrderedChildren, &dir.locks, &opts, GenericDirectoryFDOptions{})
+ if err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
+
+// NewFile implements Inode.NewFile.
+func (dir *syntheticDirectory) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (Inode, error) {
+ return nil, syserror.EPERM
+}
+
+// NewDir implements Inode.NewDir.
+func (dir *syntheticDirectory) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error) {
+ if !opts.ForSyntheticMountpoint {
+ return nil, syserror.EPERM
+ }
+ subdirI := newSyntheticDirectory(ctx, auth.CredentialsFromContext(ctx), opts.Mode&linux.PermissionsMask)
+ if err := dir.OrderedChildren.Insert(name, subdirI); err != nil {
+ subdirI.DecRef(ctx)
+ return nil, err
+ }
+ dir.TouchCMtime(ctx)
+ return subdirI, nil
+}
+
+// NewLink implements Inode.NewLink.
+func (dir *syntheticDirectory) NewLink(ctx context.Context, name string, target Inode) (Inode, error) {
+ return nil, syserror.EPERM
+}
+
+// NewSymlink implements Inode.NewSymlink.
+func (dir *syntheticDirectory) NewSymlink(ctx context.Context, name, target string) (Inode, error) {
+ return nil, syserror.EPERM
+}
+
+// NewNode implements Inode.NewNode.
+func (dir *syntheticDirectory) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (Inode, error) {
+ return nil, syserror.EPERM
+}
+
+// DecRef implements Inode.DecRef.
+func (dir *syntheticDirectory) DecRef(ctx context.Context) {
+ dir.syntheticDirectoryRefs.DecRef(func() { dir.Destroy(ctx) })
+}
+
+// Keep implements Inode.Keep. This is technically redundant, since these
+// inodes are never created via Lookup and are always valid, but returning
+// true is correct: synthetic directories are not temporary and should only
+// be removed by rmdir.
+func (dir *syntheticDirectory) Keep() bool {
+ return true
+}
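For context, a hedged sketch of the only path that reaches newSyntheticDirectory: a VFS MkdirAt call with ForSyntheticMountpoint set, which is also the only case NewDir above permits. vfsObj, creds, and root stand in for caller state:

    err := vfsObj.MkdirAt(ctx, creds, &vfs.PathOperation{
        Root:  root,
        Start: root,
        Path:  fspath.Parse("submount"),
    }, &vfs.MkdirOptions{
        Mode:                   0755,
        ForSyntheticMountpoint: true, // otherwise NewDir returns EPERM
    })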
diff --git a/pkg/sentry/fsimpl/overlay/BUILD b/pkg/sentry/fsimpl/overlay/BUILD
new file mode 100644
index 000000000..fd6c55921
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/BUILD
@@ -0,0 +1,46 @@
+load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+licenses(["notice"])
+
+go_template_instance(
+ name = "fstree",
+ out = "fstree.go",
+ package = "overlay",
+ prefix = "generic",
+ template = "//pkg/sentry/vfs/genericfstree:generic_fstree",
+ types = {
+ "Dentry": "dentry",
+ },
+)
+
+go_library(
+ name = "overlay",
+ srcs = [
+ "copy_up.go",
+ "directory.go",
+ "filesystem.go",
+ "fstree.go",
+ "overlay.go",
+ "regular_file.go",
+ "save_restore.go",
+ ],
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/context",
+ "//pkg/fspath",
+ "//pkg/log",
+ "//pkg/refsvfs2",
+ "//pkg/sentry/arch",
+ "//pkg/sentry/fs/lock",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/memmap",
+ "//pkg/sentry/socket/unix/transport",
+ "//pkg/sentry/vfs",
+ "//pkg/sync",
+ "//pkg/syserror",
+ "//pkg/usermem",
+ "//pkg/waiter",
+ ],
+)
diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go
new file mode 100644
index 000000000..4506642ca
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/copy_up.go
@@ -0,0 +1,429 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+ "fmt"
+ "io"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+func (d *dentry) isCopiedUp() bool {
+ return atomic.LoadUint32(&d.copiedUp) != 0
+}
+
+// copyUpLocked ensures that d exists on the upper layer, i.e. d.upperVD.Ok().
+//
+// Preconditions: filesystem.renameMu must be locked.
+func (d *dentry) copyUpLocked(ctx context.Context) error {
+ // Fast path.
+ if d.isCopiedUp() {
+ return nil
+ }
+
+ // Attach our credentials to the context, as some VFS operations use
+ // credentials from the context rather than taking an explicit creds parameter.
+ ctx = auth.ContextWithCredentials(ctx, d.fs.creds)
+
+ ftype := atomic.LoadUint32(&d.mode) & linux.S_IFMT
+ switch ftype {
+ case linux.S_IFREG, linux.S_IFDIR, linux.S_IFLNK, linux.S_IFBLK, linux.S_IFCHR:
+ // Can be copied-up.
+ default:
+ // Can't be copied-up.
+ return syserror.EPERM
+ }
+
+ // Ensure that our parent directory is copied-up.
+ if d.parent == nil {
+ // d is a filesystem root with no upper layer.
+ return syserror.EROFS
+ }
+ if err := d.parent.copyUpLocked(ctx); err != nil {
+ return err
+ }
+
+ d.copyMu.Lock()
+ defer d.copyMu.Unlock()
+ if d.upperVD.Ok() {
+ // Raced with another call to d.copyUpLocked().
+ return nil
+ }
+ if d.vfsd.IsDead() {
+ // Raced with deletion of d.
+ return syserror.ENOENT
+ }
+
+ // Obtain settable timestamps from the lower layer.
+ vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ oldpop := vfs.PathOperation{
+ Root: d.lowerVDs[0],
+ Start: d.lowerVDs[0],
+ }
+ const timestampsMask = linux.STATX_ATIME | linux.STATX_MTIME
+ oldStat, err := vfsObj.StatAt(ctx, d.fs.creds, &oldpop, &vfs.StatOptions{
+ Mask: timestampsMask,
+ })
+ if err != nil {
+ return err
+ }
+
+ // Perform copy-up.
+ newpop := vfs.PathOperation{
+ Root: d.parent.upperVD,
+ Start: d.parent.upperVD,
+ Path: fspath.Parse(d.name),
+ }
+ // Used during copy-up of memory-mapped regular files.
+ var mmapOpts *memmap.MMapOpts
+ cleanupUndoCopyUp := func() {
+ var err error
+ if ftype == linux.S_IFDIR {
+ err = vfsObj.RmdirAt(ctx, d.fs.creds, &newpop)
+ } else {
+ err = vfsObj.UnlinkAt(ctx, d.fs.creds, &newpop)
+ }
+ if err != nil {
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err))
+ }
+ if d.upperVD.Ok() {
+ d.upperVD.DecRef(ctx)
+ d.upperVD = vfs.VirtualDentry{}
+ }
+ }
+ switch ftype {
+ case linux.S_IFREG:
+ oldFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &oldpop, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ })
+ if err != nil {
+ return err
+ }
+ defer oldFD.DecRef(ctx)
+ newFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &newpop, &vfs.OpenOptions{
+ Flags: linux.O_WRONLY | linux.O_CREAT | linux.O_EXCL,
+ Mode: linux.FileMode(d.mode &^ linux.S_IFMT),
+ })
+ if err != nil {
+ return err
+ }
+ defer newFD.DecRef(ctx)
+ bufIOSeq := usermem.BytesIOSequence(make([]byte, 32*1024)) // arbitrary buffer size
+ for {
+ readN, readErr := oldFD.Read(ctx, bufIOSeq, vfs.ReadOptions{})
+ if readErr != nil && readErr != io.EOF {
+ cleanupUndoCopyUp()
+ return readErr
+ }
+ total := int64(0)
+ for total < readN {
+ writeN, writeErr := newFD.Write(ctx, bufIOSeq.DropFirst64(total), vfs.WriteOptions{})
+ total += writeN
+ if writeErr != nil {
+ cleanupUndoCopyUp()
+ return writeErr
+ }
+ }
+ if readErr == io.EOF {
+ break
+ }
+ }
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ if d.wrappedMappable != nil {
+ // We may have memory mappings of the file on the lower layer.
+ // Switch to mapping the file on the upper layer instead.
+ mmapOpts = &memmap.MMapOpts{
+ Perms: usermem.ReadWrite,
+ MaxPerms: usermem.ReadWrite,
+ }
+ if err := newFD.ConfigureMMap(ctx, mmapOpts); err != nil {
+ cleanupUndoCopyUp()
+ return err
+ }
+ if mmapOpts.MappingIdentity != nil {
+ mmapOpts.MappingIdentity.DecRef(ctx)
+ }
+ // Don't actually switch Mappables until the end of copy-up; see
+ // below for why.
+ }
+ if err := newFD.SetStat(ctx, vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: linux.STATX_UID | linux.STATX_GID | oldStat.Mask&timestampsMask,
+ UID: d.uid,
+ GID: d.gid,
+ Atime: oldStat.Atime,
+ Mtime: oldStat.Mtime,
+ },
+ }); err != nil {
+ cleanupUndoCopyUp()
+ return err
+ }
+ d.upperVD = newFD.VirtualDentry()
+ d.upperVD.IncRef()
+
+ case linux.S_IFDIR:
+ if err := vfsObj.MkdirAt(ctx, d.fs.creds, &newpop, &vfs.MkdirOptions{
+ Mode: linux.FileMode(d.mode &^ linux.S_IFMT),
+ }); err != nil {
+ return err
+ }
+ if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: linux.STATX_UID | linux.STATX_GID | oldStat.Mask&timestampsMask,
+ UID: d.uid,
+ GID: d.gid,
+ Atime: oldStat.Atime,
+ Mtime: oldStat.Mtime,
+ },
+ }); err != nil {
+ cleanupUndoCopyUp()
+ return err
+ }
+ upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{})
+ if err != nil {
+ cleanupUndoCopyUp()
+ return err
+ }
+ d.upperVD = upperVD
+
+ case linux.S_IFLNK:
+ target, err := vfsObj.ReadlinkAt(ctx, d.fs.creds, &oldpop)
+ if err != nil {
+ return err
+ }
+ if err := vfsObj.SymlinkAt(ctx, d.fs.creds, &newpop, target); err != nil {
+ return err
+ }
+ if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | oldStat.Mask&timestampsMask,
+ Mode: uint16(d.mode),
+ UID: d.uid,
+ GID: d.gid,
+ Atime: oldStat.Atime,
+ Mtime: oldStat.Mtime,
+ },
+ }); err != nil {
+ cleanupUndoCopyUp()
+ return err
+ }
+ upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{})
+ if err != nil {
+ cleanupUndoCopyUp()
+ return err
+ }
+ d.upperVD = upperVD
+
+ case linux.S_IFBLK, linux.S_IFCHR:
+ if err := vfsObj.MknodAt(ctx, d.fs.creds, &newpop, &vfs.MknodOptions{
+ Mode: linux.FileMode(d.mode),
+ DevMajor: oldStat.RdevMajor,
+ DevMinor: oldStat.RdevMinor,
+ }); err != nil {
+ return err
+ }
+ if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: linux.STATX_UID | linux.STATX_GID | oldStat.Mask&timestampsMask,
+ UID: d.uid,
+ GID: d.gid,
+ Atime: oldStat.Atime,
+ Mtime: oldStat.Mtime,
+ },
+ }); err != nil {
+ cleanupUndoCopyUp()
+ return err
+ }
+ upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{})
+ if err != nil {
+ cleanupUndoCopyUp()
+ return err
+ }
+ d.upperVD = upperVD
+
+ default:
+ // Unreachable: unsupported types were rejected by the file type check at
+ // the beginning of this function.
+ panic(fmt.Sprintf("unexpected file type %o", ftype))
+ }
+
+ if err := d.copyXattrsLocked(ctx); err != nil {
+ cleanupUndoCopyUp()
+ return err
+ }
+
+ // Update the dentry's device and inode numbers (except for directories,
+ // for which these remain overlay-assigned).
+ if ftype != linux.S_IFDIR {
+ upperStat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.upperVD,
+ Start: d.upperVD,
+ }, &vfs.StatOptions{
+ Mask: linux.STATX_INO,
+ })
+ if err != nil {
+ cleanupUndoCopyUp()
+ return err
+ }
+ if upperStat.Mask&linux.STATX_INO == 0 {
+ cleanupUndoCopyUp()
+ return syserror.EREMOTE
+ }
+ atomic.StoreUint32(&d.devMajor, upperStat.DevMajor)
+ atomic.StoreUint32(&d.devMinor, upperStat.DevMinor)
+ atomic.StoreUint64(&d.ino, upperStat.Ino)
+ }
+
+ if mmapOpts != nil && mmapOpts.Mappable != nil {
+ // Note that if mmapOpts != nil, then d.mapsMu is locked for writing
+ // (from the S_IFREG path above).
+
+ // Propagate mappings of d to the new Mappable. Remember which mappings
+ // we added so we can remove them on failure.
+ upperMappable := mmapOpts.Mappable
+ allAdded := make(map[memmap.MappableRange]memmap.MappingsOfRange)
+ for seg := d.lowerMappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+ added := make(memmap.MappingsOfRange)
+ for m := range seg.Value() {
+ if err := upperMappable.AddMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable); err != nil {
+ for m := range added {
+ upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable)
+ }
+ for mr, mappings := range allAdded {
+ for m := range mappings {
+ upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, mr.Start, m.Writable)
+ }
+ }
+ return err
+ }
+ added[m] = struct{}{}
+ }
+ allAdded[seg.Range()] = added
+ }
+
+ // Switch to the new Mappable. We do this at the end of copy-up
+ // because:
+ //
+ // - We need to switch Mappables (by changing d.wrappedMappable) before
+ // invalidating Translations from the old Mappable (to pick up
+ // Translations from the new one).
+ //
+ // - We need to lock d.dataMu while changing d.wrappedMappable, but
+ // must invalidate Translations with d.dataMu unlocked (due to lock
+ // ordering).
+ //
+ // - Consequently, once we unlock d.dataMu, other threads may
+ // immediately observe the new (copied-up) Mappable, which we want to
+ // delay until copy-up is guaranteed to succeed.
+ d.dataMu.Lock()
+ lowerMappable := d.wrappedMappable
+ d.wrappedMappable = upperMappable
+ d.dataMu.Unlock()
+ d.lowerMappings.InvalidateAll(memmap.InvalidateOpts{})
+
+ // Remove mappings from the old Mappable.
+ for seg := d.lowerMappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+ for m := range seg.Value() {
+ lowerMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable)
+ }
+ }
+ d.lowerMappings.RemoveAll()
+ }
+
+ atomic.StoreUint32(&d.copiedUp, 1)
+ return nil
+}
+
+// copyXattrsLocked copies a subset of lower's extended attributes to upper.
+// Attributes that configure an overlay in the lower layer are not copied up.
+//
+// Preconditions: d.copyMu must be locked for writing.
+func (d *dentry) copyXattrsLocked(ctx context.Context) error {
+ vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ lowerPop := &vfs.PathOperation{Root: d.lowerVDs[0], Start: d.lowerVDs[0]}
+ upperPop := &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}
+
+ lowerXattrs, err := vfsObj.ListXattrAt(ctx, d.fs.creds, lowerPop, 0)
+ if err != nil {
+ if err == syserror.EOPNOTSUPP {
+ // There are no guarantees as to the contents of lowerXattrs.
+ return nil
+ }
+ ctx.Infof("failed to copy up xattrs because ListXattrAt failed: %v", err)
+ return err
+ }
+
+ for _, name := range lowerXattrs {
+ // Do not copy up overlay attributes.
+ if isOverlayXattr(name) {
+ continue
+ }
+
+ value, err := vfsObj.GetXattrAt(ctx, d.fs.creds, lowerPop, &vfs.GetXattrOptions{Name: name, Size: 0})
+ if err != nil {
+ ctx.Infof("failed to copy up xattrs because GetXattrAt failed: %v", err)
+ return err
+ }
+
+ if err := vfsObj.SetXattrAt(ctx, d.fs.creds, upperPop, &vfs.SetXattrOptions{Name: name, Value: value}); err != nil {
+ ctx.Infof("failed to copy up xattrs because SetXattrAt failed: %v", err)
+ return err
+ }
+ }
+ return nil
+}
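isOverlayXattr is defined elsewhere in this package rather than in this diff; consistent with its use here and with the _OVL_XATTR_PREFIX constant introduced in filesystem.go below, a plausible sketch is a prefix test against the trusted overlay namespace:

    // Sketch only; assumes the real helper matches this prefix check.
    func isOverlayXattr(name string) bool {
        return strings.HasPrefix(name, _OVL_XATTR_PREFIX)
    }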
+
+// copyUpDescendantsLocked ensures that all descendants of d are copied up.
+//
+// Preconditions:
+// * filesystem.renameMu must be locked.
+// * d.dirMu must be locked.
+// * d.isDir().
+func (d *dentry) copyUpDescendantsLocked(ctx context.Context, ds **[]*dentry) error {
+ dirents, err := d.getDirentsLocked(ctx)
+ if err != nil {
+ return err
+ }
+ for _, dirent := range dirents {
+ if dirent.Name == "." || dirent.Name == ".." {
+ continue
+ }
+ child, err := d.fs.getChildLocked(ctx, d, dirent.Name, ds)
+ if err != nil {
+ return err
+ }
+ if err := child.copyUpLocked(ctx); err != nil {
+ return err
+ }
+ if child.isDir() {
+ child.dirMu.Lock()
+ err := child.copyUpDescendantsLocked(ctx, ds)
+ child.dirMu.Unlock()
+ if err != nil {
+ return err
+ }
+ }
+ }
+ return nil
+}
diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go
new file mode 100644
index 000000000..df4492346
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/directory.go
@@ -0,0 +1,301 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+func (d *dentry) isDir() bool {
+ return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR
+}
+
+// Preconditions:
+// * d.dirMu must be locked.
+// * d.isDir().
+func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string]bool, error) {
+ vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ var readdirErr error
+ whiteouts := make(map[string]bool)
+ var maybeWhiteouts []string
+ d.iterLayers(func(layerVD vfs.VirtualDentry, isUpper bool) bool {
+ layerFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: layerVD,
+ Start: layerVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY | linux.O_DIRECTORY,
+ })
+ if err != nil {
+ readdirErr = err
+ return false
+ }
+ defer layerFD.DecRef(ctx)
+
+ // Reuse slice allocated for maybeWhiteouts from a previous layer to
+ // reduce allocations.
+ maybeWhiteouts = maybeWhiteouts[:0]
+ err = layerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
+ if dirent.Name == "." || dirent.Name == ".." {
+ return nil
+ }
+ if _, ok := whiteouts[dirent.Name]; ok {
+ // This file has been whited-out in a previous layer.
+ return nil
+ }
+ if dirent.Type == linux.DT_CHR {
+ // We have to determine if this is a whiteout, which doesn't
+ // count against the directory's emptiness. However, we can't
+ // do so while holding locks held by layerFD.IterDirents().
+ maybeWhiteouts = append(maybeWhiteouts, dirent.Name)
+ return nil
+ }
+ // Non-whiteout file in the directory prevents rmdir.
+ return syserror.ENOTEMPTY
+ }))
+ if err != nil {
+ readdirErr = err
+ return false
+ }
+
+ for _, maybeWhiteoutName := range maybeWhiteouts {
+ stat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: layerVD,
+ Start: layerVD,
+ Path: fspath.Parse(maybeWhiteoutName),
+ }, &vfs.StatOptions{})
+ if err != nil {
+ readdirErr = err
+ return false
+ }
+ if stat.RdevMajor != 0 || stat.RdevMinor != 0 {
+ // This file is a real character device, not a whiteout.
+ readdirErr = syserror.ENOTEMPTY
+ return false
+ }
+ whiteouts[maybeWhiteoutName] = isUpper
+ }
+ // Continue iteration since we haven't found any non-whiteout files in
+ // this directory yet.
+ return true
+ })
+ return whiteouts, readdirErr
+}
+
+// +stateify savable
+type directoryFD struct {
+ fileDescription
+ vfs.DirectoryFileDescriptionDefaultImpl
+ vfs.DentryMetadataFileDescriptionImpl
+
+ mu sync.Mutex `state:"nosave"`
+ off int64
+ dirents []vfs.Dirent
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *directoryFD) Release(ctx context.Context) {
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+ d := fd.dentry()
+ defer d.InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent)
+
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+
+ if fd.dirents == nil {
+ ds, err := d.getDirents(ctx)
+ if err != nil {
+ return err
+ }
+ fd.dirents = ds
+ }
+
+ for fd.off < int64(len(fd.dirents)) {
+ if err := cb.Handle(fd.dirents[fd.off]); err != nil {
+ return err
+ }
+ fd.off++
+ }
+ return nil
+}
+
+// Preconditions: d.isDir().
+func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
+ d.fs.renameMu.RLock()
+ defer d.fs.renameMu.RUnlock()
+ d.dirMu.Lock()
+ defer d.dirMu.Unlock()
+ return d.getDirentsLocked(ctx)
+}
+
+// Preconditions:
+// * filesystem.renameMu must be locked.
+// * d.dirMu must be locked.
+// * d.isDir().
+func (d *dentry) getDirentsLocked(ctx context.Context) ([]vfs.Dirent, error) {
+ if d.dirents != nil {
+ return d.dirents, nil
+ }
+
+ parent := genericParentOrSelf(d)
+ dirents := []vfs.Dirent{
+ {
+ Name: ".",
+ Type: linux.DT_DIR,
+ Ino: d.ino,
+ NextOff: 1,
+ },
+ {
+ Name: "..",
+ Type: uint8(atomic.LoadUint32(&parent.mode) >> 12),
+ Ino: parent.ino,
+ NextOff: 2,
+ },
+ }
+
+ // Merge dirents from all layers comprising this directory.
+ vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ var readdirErr error
+ prevDirents := make(map[string]struct{})
+ var maybeWhiteouts []vfs.Dirent
+ d.iterLayers(func(layerVD vfs.VirtualDentry, isUpper bool) bool {
+ layerFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: layerVD,
+ Start: layerVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY | linux.O_DIRECTORY,
+ })
+ if err != nil {
+ readdirErr = err
+ return false
+ }
+ defer layerFD.DecRef(ctx)
+
+ // Reuse slice allocated for maybeWhiteouts from a previous layer to
+ // reduce allocations.
+ maybeWhiteouts = maybeWhiteouts[:0]
+ err = layerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
+ if dirent.Name == "." || dirent.Name == ".." {
+ return nil
+ }
+ if _, ok := prevDirents[dirent.Name]; ok {
+ // This file is hidden by, or merged with, another file with
+ // the same name in a previous layer.
+ return nil
+ }
+ prevDirents[dirent.Name] = struct{}{}
+ if dirent.Type == linux.DT_CHR {
+ // We can't determine if this file is a whiteout while holding
+ // locks held by layerFD.IterDirents().
+ maybeWhiteouts = append(maybeWhiteouts, dirent)
+ return nil
+ }
+ dirent.NextOff = int64(len(dirents) + 1)
+ dirents = append(dirents, dirent)
+ return nil
+ }))
+ if err != nil {
+ readdirErr = err
+ return false
+ }
+
+ for _, dirent := range maybeWhiteouts {
+ stat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: layerVD,
+ Start: layerVD,
+ Path: fspath.Parse(dirent.Name),
+ }, &vfs.StatOptions{})
+ if err != nil {
+ readdirErr = err
+ return false
+ }
+ if stat.RdevMajor == 0 && stat.RdevMinor == 0 {
+ // This file is a whiteout; don't emit a dirent for it.
+ continue
+ }
+ dirent.NextOff = int64(len(dirents) + 1)
+ dirents = append(dirents, dirent)
+ }
+ return true
+ })
+ if readdirErr != nil {
+ return nil, readdirErr
+ }
+
+ // Cache dirents for future directoryFDs.
+ d.dirents = dirents
+ return dirents, nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+
+ switch whence {
+ case linux.SEEK_SET:
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+ if offset == 0 {
+ // Ensure that the next call to fd.IterDirents() calls
+ // fd.dentry().getDirents().
+ fd.dirents = nil
+ }
+ fd.off = offset
+ return fd.off, nil
+ case linux.SEEK_CUR:
+ offset += fd.off
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+ // Don't clear fd.dirents in this case, even if offset == 0.
+ fd.off = offset
+ return fd.off, nil
+ default:
+ return 0, syserror.EINVAL
+ }
+}
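A brief usage note on the SEEK_SET special case above: rewinding to offset 0 is how a reader forces re-enumeration after the directory changes, since it drops the FD's cached dirents. Sketch, with fd a *directoryFD obtained via OpenAt:

    // Clears fd.dirents, so the next IterDirents re-merges the layers.
    if _, err := fd.Seek(ctx, 0, linux.SEEK_SET); err != nil {
        return err
    }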
+
+// Sync implements vfs.FileDescriptionImpl.Sync. Forwards sync to the upper
+// layer, if there is one. The lower layer doesn't need to sync because it
+// never changes.
+func (fd *directoryFD) Sync(ctx context.Context) error {
+ d := fd.dentry()
+ if !d.isCopiedUp() {
+ return nil
+ }
+ vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ pop := vfs.PathOperation{
+ Root: d.upperVD,
+ Start: d.upperVD,
+ }
+ upperFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY})
+ if err != nil {
+ return err
+ }
+ err = upperFD.Sync(ctx)
+ upperFD.DecRef(ctx)
+ return err
+}
diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go
new file mode 100644
index 000000000..10161a08d
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/filesystem.go
@@ -0,0 +1,1780 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+ "fmt"
+ "strings"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// _OVL_XATTR_PREFIX is an extended attribute key prefix to identify overlayfs
+// attributes.
+// Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_PREFIX
+const _OVL_XATTR_PREFIX = linux.XATTR_TRUSTED_PREFIX + "overlay."
+
+// _OVL_XATTR_OPAQUE is an extended attribute key whose value is set to "y" for
+// opaque directories.
+// Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_OPAQUE
+const _OVL_XATTR_OPAQUE = _OVL_XATTR_PREFIX + "opaque"
+
+func isWhiteout(stat *linux.Statx) bool {
+ return stat.Mode&linux.S_IFMT == linux.S_IFCHR && stat.RdevMajor == 0 && stat.RdevMinor == 0
+}
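isWhiteout encodes the Linux overlayfs convention that a whiteout is a character device with device number 0:0; createWhiteout later in this diff creates exactly that via MknodAt. A sketch of the detection side, where pop is a placeholder PathOperation into some layer:

    stat, err := vfsObj.StatAt(ctx, fs.creds, pop, &vfs.StatOptions{Mask: linux.STATX_TYPE})
    if err == nil && isWhiteout(&stat) {
        // Treat the path as deleted; layers below this one are ignored.
    }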
+
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *filesystem) Sync(ctx context.Context) error {
+ if fs.opts.UpperRoot.Ok() {
+ return fs.opts.UpperRoot.Mount().Filesystem().Impl().Sync(ctx)
+ }
+ return nil
+}
+
+var dentrySlicePool = sync.Pool{
+ New: func() interface{} {
+ ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity
+ return &ds
+ },
+}
+
+func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry {
+ if ds == nil {
+ ds = dentrySlicePool.Get().(*[]*dentry)
+ }
+ *ds = append(*ds, d)
+ return ds
+}
+
+// Preconditions: ds != nil.
+func putDentrySlice(ds *[]*dentry) {
+ // Allow dentries to be GC'd.
+ for i := range *ds {
+ (*ds)[i] = nil
+ }
+ *ds = (*ds)[:0]
+ dentrySlicePool.Put(ds)
+}
+
+// renameMuRUnlockAndCheckDrop calls fs.renameMu.RUnlock(), then calls
+// dentry.checkDropLocked on all dentries in *ds with fs.renameMu locked for
+// writing.
+//
+// ds is a pointer-to-pointer since defer evaluates its arguments immediately,
+// but dentry slices are allocated lazily, and it's much easier to say "defer
+// fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() {
+// fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this.
+func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) {
+ fs.renameMu.RUnlock()
+ if *ds == nil {
+ return
+ }
+ if len(**ds) != 0 {
+ fs.renameMu.Lock()
+ for _, d := range **ds {
+ d.checkDropLocked(ctx)
+ }
+ fs.renameMu.Unlock()
+ }
+ putDentrySlice(*ds)
+}
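The pointer-to-pointer comment above deserves a standalone illustration: defer evaluates its arguments when the defer statement executes, so passing the slice pointer by value would capture nil before the lazy allocation in appendDentry. Sketch, with cleanup a hypothetical stand-in for renameMuRUnlockAndCheckDrop:

    var ds *[]*dentry
    // Wrong: evaluates ds now, capturing nil before lazy allocation.
    //   defer cleanup(ds)
    // Right: &ds is stable; *ds is read when the deferred call runs.
    defer cleanup(&ds)
    ds = appendDentry(ds, d)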
+
+func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) {
+ if *ds == nil {
+ fs.renameMu.Unlock()
+ return
+ }
+ for _, d := range **ds {
+ d.checkDropLocked(ctx)
+ }
+ fs.renameMu.Unlock()
+ putDentrySlice(*ds)
+}
+
+// stepLocked resolves rp.Component() to an existing file, starting from the
+// given directory.
+//
+// Dentries which may have a reference count of zero, and which therefore
+// should be dropped once traversal is complete, are appended to ds.
+//
+// Preconditions:
+// * fs.renameMu must be locked.
+// * d.dirMu must be locked.
+// * !rp.Done().
+func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
+ if !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, err
+ }
+afterSymlink:
+ name := rp.Component()
+ if name == "." {
+ rp.Advance()
+ return d, nil
+ }
+ if name == ".." {
+ if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
+ return nil, err
+ } else if isRoot || d.parent == nil {
+ rp.Advance()
+ return d, nil
+ }
+ if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
+ return nil, err
+ }
+ rp.Advance()
+ return d.parent, nil
+ }
+ child, err := fs.getChildLocked(ctx, d, name, ds)
+ if err != nil {
+ return nil, err
+ }
+ if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
+ return nil, err
+ }
+ if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() {
+ target, err := child.readlink(ctx)
+ if err != nil {
+ return nil, err
+ }
+ if err := rp.HandleSymlink(target); err != nil {
+ return nil, err
+ }
+ goto afterSymlink // don't check the current directory again
+ }
+ rp.Advance()
+ return child, nil
+}
+
+// Preconditions:
+// * fs.renameMu must be locked.
+// * d.dirMu must be locked.
+func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
+ if child, ok := parent.children[name]; ok {
+ return child, nil
+ }
+ child, err := fs.lookupLocked(ctx, parent, name)
+ if err != nil {
+ return nil, err
+ }
+ if parent.children == nil {
+ parent.children = make(map[string]*dentry)
+ }
+ parent.children[name] = child
+ // child's refcount is initially 0, so it may be dropped after traversal.
+ *ds = appendDentry(*ds, child)
+ return child, nil
+}
+
+// Preconditions:
+// * fs.renameMu must be locked.
+// * parent.dirMu must be locked.
+func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) {
+ childPath := fspath.Parse(name)
+ child := fs.newDentry()
+ existsOnAnyLayer := false
+ var lookupErr error
+
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+ parent.iterLayers(func(parentVD vfs.VirtualDentry, isUpper bool) bool {
+ childVD, err := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: parentVD,
+ Start: parentVD,
+ Path: childPath,
+ }, &vfs.GetDentryOptions{})
+ if err == syserror.ENOENT || err == syserror.ENAMETOOLONG {
+ // The file doesn't exist on this layer. Proceed to the next one.
+ return true
+ }
+ if err != nil {
+ lookupErr = err
+ return false
+ }
+ defer childVD.DecRef(ctx)
+
+ mask := uint32(linux.STATX_TYPE)
+ if !existsOnAnyLayer {
+ // Mode, UID, GID, and (for non-directories) inode number come from
+ // the topmost layer on which the file exists.
+ mask |= linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
+ }
+ stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: childVD,
+ Start: childVD,
+ }, &vfs.StatOptions{
+ Mask: mask,
+ })
+ if err != nil {
+ lookupErr = err
+ return false
+ }
+ if stat.Mask&mask != mask {
+ lookupErr = syserror.EREMOTE
+ return false
+ }
+
+ if isWhiteout(&stat) {
+ // This is a whiteout, so it "doesn't exist" on this layer, and
+ // layers below this one are ignored.
+ return false
+ }
+ isDir := stat.Mode&linux.S_IFMT == linux.S_IFDIR
+ if existsOnAnyLayer && !isDir {
+ // Directories are not merged with non-directory files from lower
+ // layers; instead, layers including and below the first
+ // non-directory file are ignored. (This file must be a directory
+ // on previous layers, since lower layers aren't searched for
+ // non-directory files.)
+ return false
+ }
+
+ // Update child to include this layer.
+ childVD.IncRef()
+ if isUpper {
+ child.upperVD = childVD
+ child.copiedUp = 1
+ } else {
+ child.lowerVDs = append(child.lowerVDs, childVD)
+ }
+ if !existsOnAnyLayer {
+ existsOnAnyLayer = true
+ child.mode = uint32(stat.Mode)
+ child.uid = stat.UID
+ child.gid = stat.GID
+ child.devMajor = stat.DevMajor
+ child.devMinor = stat.DevMinor
+ child.ino = stat.Ino
+ }
+
+ // For non-directory files, only the topmost layer that contains a file
+ // matters.
+ if !isDir {
+ return false
+ }
+
+ // Directories are merged with directories from lower layers if they
+ // are not explicitly opaque.
+ opaqueVal, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: childVD,
+ Start: childVD,
+ }, &vfs.GetXattrOptions{
+ Name: _OVL_XATTR_OPAQUE,
+ Size: 1,
+ })
+ return !(err == nil && opaqueVal == "y")
+ })
+
+ if lookupErr != nil {
+ child.destroyLocked(ctx)
+ return nil, lookupErr
+ }
+ if !existsOnAnyLayer {
+ child.destroyLocked(ctx)
+ return nil, syserror.ENOENT
+ }
+
+ // Device and inode numbers were copied from the topmost layer above;
+ // override them if necessary.
+ if child.isDir() {
+ child.devMajor = linux.UNNAMED_MAJOR
+ child.devMinor = fs.dirDevMinor
+ child.ino = fs.newDirIno()
+ } else if !child.upperVD.Ok() {
+ childDevMinor, err := fs.getLowerDevMinor(child.devMajor, child.devMinor)
+ if err != nil {
+ ctx.Infof("overlay.filesystem.lookupLocked: failed to map lower layer device number (%d, %d) to an overlay-specific device number: %v", child.devMajor, child.devMinor, err)
+ child.destroyLocked(ctx)
+ return nil, err
+ }
+ child.devMajor = linux.UNNAMED_MAJOR
+ child.devMinor = childDevMinor
+ }
+
+ parent.IncRef()
+ child.parent = parent
+ child.name = name
+ return child, nil
+}
+
+// lookupLayerLocked is similar to lookupLocked, but only returns information
+// about the file rather than a dentry.
+//
+// Preconditions:
+// * fs.renameMu must be locked.
+// * parent.dirMu must be locked.
+func (fs *filesystem) lookupLayerLocked(ctx context.Context, parent *dentry, name string) (lookupLayer, error) {
+ childPath := fspath.Parse(name)
+ lookupLayer := lookupLayerNone
+ var lookupErr error
+
+ parent.iterLayers(func(parentVD vfs.VirtualDentry, isUpper bool) bool {
+ stat, err := fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: parentVD,
+ Start: parentVD,
+ Path: childPath,
+ }, &vfs.StatOptions{
+ Mask: linux.STATX_TYPE,
+ })
+ if err == syserror.ENOENT || err == syserror.ENAMETOOLONG {
+ // The file doesn't exist on this layer. Proceed to the next
+ // one.
+ return true
+ }
+ if err != nil {
+ lookupErr = err
+ return false
+ }
+ if stat.Mask&linux.STATX_TYPE == 0 {
+ // Linux's overlayfs tends to return EREMOTE in cases where a file
+ // is unusable for reasons that are not better captured by another
+ // errno.
+ lookupErr = syserror.EREMOTE
+ return false
+ }
+ if isWhiteout(&stat) {
+ // This is a whiteout, so it "doesn't exist" on this layer, and
+ // layers below this one are ignored.
+ if isUpper {
+ lookupLayer = lookupLayerUpperWhiteout
+ }
+ return false
+ }
+ // The file exists; we can stop searching.
+ if isUpper {
+ lookupLayer = lookupLayerUpper
+ } else {
+ lookupLayer = lookupLayerLower
+ }
+ return false
+ })
+
+ return lookupLayer, lookupErr
+}
+
+type lookupLayer int
+
+const (
+ // lookupLayerNone indicates that no file exists at the given path on the
+ // upper layer, and is either whited out or does not exist on lower layers.
+ // Therefore, the file does not exist in the overlay filesystem, and file
+ // creation may proceed normally (if an upper layer exists).
+ lookupLayerNone lookupLayer = iota
+
+ // lookupLayerLower indicates that no file exists at the given path on the
+ // upper layer, but exists on a lower layer. Therefore, the file exists in
+ // the overlay filesystem, but must be copied-up before mutation.
+ lookupLayerLower
+
+ // lookupLayerUpper indicates that a non-whiteout file exists at the given
+ // path on the upper layer. Therefore, the file exists in the overlay
+ // filesystem, and is already copied-up.
+ lookupLayerUpper
+
+ // lookupLayerUpperWhiteout indicates that a whiteout exists at the given
+ // path on the upper layer. Therefore, the file does not exist in the
+ // overlay filesystem, and file creation must remove the whiteout before
+ // proceeding.
+ lookupLayerUpperWhiteout
+)
+
+func (ll lookupLayer) existsInOverlay() bool {
+ return ll == lookupLayerLower || ll == lookupLayerUpper
+}
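A short note on how these values are consumed by doCreateAt below: existsInOverlay gates EEXIST, and lookupLayerUpperWhiteout tells the create callback that a whiteout must be unlinked first. Sketch:

    switch childLayer {
    case lookupLayerLower, lookupLayerUpper:
        return syserror.EEXIST // already visible in the overlay
    case lookupLayerUpperWhiteout:
        // create() must unlink the whiteout before creating the file.
    }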
+
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory, starting from the given directory (which is usually
+// rp.Start().Impl().(*dentry)). It does not check that the returned directory
+// is searchable by the provider of rp.
+//
+// Preconditions:
+// * fs.renameMu must be locked.
+// * !rp.Done().
+func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
+ for !rp.Final() {
+ d.dirMu.Lock()
+ next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+ d.dirMu.Unlock()
+ if err != nil {
+ return nil, err
+ }
+ d = next
+ }
+ if !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ return d, nil
+}
+
+// resolveLocked resolves rp to an existing file.
+//
+// Preconditions: fs.renameMu must be locked.
+func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
+ d := rp.Start().Impl().(*dentry)
+ for !rp.Done() {
+ d.dirMu.Lock()
+ next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+ d.dirMu.Unlock()
+ if err != nil {
+ return nil, err
+ }
+ d = next
+ }
+ if rp.MustBeDir() && !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ return d, nil
+}
+
+// doCreateAt checks that creating a file at rp is permitted, then invokes
+// create to do so.
+//
+// Preconditions:
+// * !rp.Done().
+// * For the final path component in rp, !rp.ShouldFollowSymlink().
+func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string, haveUpperWhiteout bool) error) error {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ start := rp.Start().Impl().(*dentry)
+ parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+ if err != nil {
+ return err
+ }
+ if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+ return err
+ }
+ name := rp.Component()
+ if name == "." || name == ".." {
+ return syserror.EEXIST
+ }
+ if !dir && rp.MustBeDir() {
+ return syserror.ENOENT
+ }
+ if parent.vfsd.IsDead() {
+ return syserror.ENOENT
+ }
+ mnt := rp.Mount()
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer mnt.EndWrite()
+ parent.dirMu.Lock()
+ defer parent.dirMu.Unlock()
+
+ // Determine if a file already exists at name.
+ if _, ok := parent.children[name]; ok {
+ return syserror.EEXIST
+ }
+ childLayer, err := fs.lookupLayerLocked(ctx, parent, name)
+ if err != nil {
+ return err
+ }
+ if childLayer.existsInOverlay() {
+ return syserror.EEXIST
+ }
+
+ // Ensure that the parent directory is copied-up so that we can create the
+ // new file in the upper layer.
+ if err := parent.copyUpLocked(ctx); err != nil {
+ return err
+ }
+
+ // Finally create the new file.
+ if err := create(parent, name, childLayer == lookupLayerUpperWhiteout); err != nil {
+ return err
+ }
+
+ parent.dirents = nil
+ ev := linux.IN_CREATE
+ if dir {
+ ev |= linux.IN_ISDIR
+ }
+ parent.watches.Notify(ctx, name, uint32(ev), 0 /* cookie */, vfs.InodeEvent, false /* unlinked */)
+ return nil
+}
+
+// Preconditions: pop's parent directory has been copied up.
+func (fs *filesystem) createWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) error {
+ return vfsObj.MknodAt(ctx, fs.creds, pop, &vfs.MknodOptions{
+ Mode: linux.S_IFCHR, // permissions == include/linux/fs.h:WHITEOUT_MODE == 0
+ // DevMajor == DevMinor == 0, from include/linux/fs.h:WHITEOUT_DEV
+ })
+}
+
+func (fs *filesystem) cleanupRecreateWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) {
+ if err := fs.createWhiteout(ctx, vfsObj, pop); err != nil {
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err))
+ }
+}
+
+// AccessAt implements vfs.Filesystem.Impl.AccessAt.
+func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return err
+ }
+ return d.checkPermissions(creds, ats)
+}
+
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
+func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return nil, err
+ }
+ if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+ return nil, err
+ }
+ layerVD := d.topLayer()
+ return fs.vfsfs.VirtualFilesystem().BoundEndpointAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: layerVD,
+ Start: layerVD,
+ }, &opts)
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return nil, err
+ }
+ if opts.CheckSearchable {
+ if !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, err
+ }
+ }
+ d.IncRef()
+ return &d.vfsd, nil
+}
+
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ start := rp.Start().Impl().(*dentry)
+ d, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+ if err != nil {
+ return nil, err
+ }
+ d.IncRef()
+ return &d.vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+ return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
+ if rp.Mount() != vd.Mount() {
+ return syserror.EXDEV
+ }
+ old := vd.Dentry().Impl().(*dentry)
+ if old.isDir() {
+ return syserror.EPERM
+ }
+ if err := old.copyUpLocked(ctx); err != nil {
+ return err
+ }
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+ newpop := vfs.PathOperation{
+ Root: parent.upperVD,
+ Start: parent.upperVD,
+ Path: fspath.Parse(childName),
+ }
+ if haveUpperWhiteout {
+ if err := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); err != nil {
+ return err
+ }
+ }
+ if err := vfsObj.LinkAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: old.upperVD,
+ Start: old.upperVD,
+ }, &newpop); err != nil {
+ if haveUpperWhiteout {
+ fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop)
+ }
+ return err
+ }
+ creds := rp.Credentials()
+ if err := vfsObj.SetStatAt(ctx, fs.creds, &newpop, &vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: linux.STATX_UID | linux.STATX_GID,
+ UID: uint32(creds.EffectiveKUID),
+ GID: uint32(creds.EffectiveKGID),
+ },
+ }); err != nil {
+ if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); cleanupErr != nil {
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr))
+ } else if haveUpperWhiteout {
+ fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop)
+ }
+ return err
+ }
+ old.watches.Notify(ctx, "", linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent, false /* unlinked */)
+ return nil
+ })
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+ return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+ pop := vfs.PathOperation{
+ Root: parent.upperVD,
+ Start: parent.upperVD,
+ Path: fspath.Parse(childName),
+ }
+ if haveUpperWhiteout {
+ if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
+ return err
+ }
+ }
+ if err := vfsObj.MkdirAt(ctx, fs.creds, &pop, &opts); err != nil {
+ if haveUpperWhiteout {
+ fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+ }
+ return err
+ }
+ creds := rp.Credentials()
+ if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: linux.STATX_UID | linux.STATX_GID,
+ UID: uint32(creds.EffectiveKUID),
+ GID: uint32(creds.EffectiveKGID),
+ },
+ }); err != nil {
+ if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr))
+ } else if haveUpperWhiteout {
+ fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+ }
+ return err
+ }
+ if haveUpperWhiteout {
+ // There may be directories on lower layers (previously hidden by
+ // the whiteout) that the new directory should not be merged with.
+ // Mark it opaque to prevent merging.
+ if err := vfsObj.SetXattrAt(ctx, fs.creds, &pop, &vfs.SetXattrOptions{
+ Name: _OVL_XATTR_OPAQUE,
+ Value: "y",
+ }); err != nil {
+ if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr))
+ } else {
+ fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+ }
+ return err
+ }
+ }
+ return nil
+ })
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+ return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
+ // Disallow attempts to create whiteouts.
+ if opts.Mode&linux.S_IFMT == linux.S_IFCHR && opts.DevMajor == 0 && opts.DevMinor == 0 {
+ return syserror.EPERM
+ }
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+ pop := vfs.PathOperation{
+ Root: parent.upperVD,
+ Start: parent.upperVD,
+ Path: fspath.Parse(childName),
+ }
+ if haveUpperWhiteout {
+ if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
+ return err
+ }
+ }
+ if err := vfsObj.MknodAt(ctx, fs.creds, &pop, &opts); err != nil {
+ if haveUpperWhiteout {
+ fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+ }
+ return err
+ }
+ creds := rp.Credentials()
+ if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: linux.STATX_UID | linux.STATX_GID,
+ UID: uint32(creds.EffectiveKUID),
+ GID: uint32(creds.EffectiveKGID),
+ },
+ }); err != nil {
+ if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr))
+ } else if haveUpperWhiteout {
+ fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+ }
+ return err
+ }
+ return nil
+ })
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ mayCreate := opts.Flags&linux.O_CREAT != 0
+ mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL)
+ mayWrite := vfs.AccessTypesForOpenFlags(&opts).MayWrite()
+
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ unlocked := false
+ unlock := func() {
+ if !unlocked {
+ fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ unlocked = true
+ }
+ }
+ defer unlock()
+
+ start := rp.Start().Impl().(*dentry)
+ if rp.Done() {
+ if mayCreate && rp.MustBeDir() {
+ return nil, syserror.EISDIR
+ }
+ if mustCreate {
+ return nil, syserror.EEXIST
+ }
+ if start.isRegularFile() && mayWrite {
+ if err := start.copyUpLocked(ctx); err != nil {
+ return nil, err
+ }
+ }
+ start.IncRef()
+ defer start.DecRef(ctx)
+ unlock()
+ return start.openCopiedUp(ctx, rp, &opts)
+ }
+
+afterTrailingSymlink:
+ parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+ if err != nil {
+ return nil, err
+ }
+ // Check for search permission in the parent directory.
+ if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, err
+ }
+ // Reject attempts to open directories with O_CREAT.
+ if mayCreate && rp.MustBeDir() {
+ return nil, syserror.EISDIR
+ }
+ // Determine whether or not we need to create a file.
+ parent.dirMu.Lock()
+ child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
+ if err == syserror.ENOENT && mayCreate {
+ fd, err := fs.createAndOpenLocked(ctx, rp, parent, &opts, &ds)
+ parent.dirMu.Unlock()
+ return fd, err
+ }
+ parent.dirMu.Unlock()
+ if err != nil {
+ return nil, err
+ }
+ // Open existing child or follow symlink.
+ if mustCreate {
+ return nil, syserror.EEXIST
+ }
+ if child.isSymlink() && rp.ShouldFollowSymlink() {
+ target, err := child.readlink(ctx)
+ if err != nil {
+ return nil, err
+ }
+ if err := rp.HandleSymlink(target); err != nil {
+ return nil, err
+ }
+ start = parent
+ goto afterTrailingSymlink
+ }
+ if rp.MustBeDir() && !child.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ if child.isRegularFile() && mayWrite {
+ if err := child.copyUpLocked(ctx); err != nil {
+ return nil, err
+ }
+ }
+ child.IncRef()
+ defer child.DecRef(ctx)
+ unlock()
+ return child.openCopiedUp(ctx, rp, &opts)
+}
+
+// Preconditions: If vfs.AccessTypesForOpenFlags(opts).MayWrite(), then d has
+// been copied up.
+func (d *dentry) openCopiedUp(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+ ats := vfs.AccessTypesForOpenFlags(opts)
+ if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
+ return nil, err
+ }
+ mnt := rp.Mount()
+
+ // Directory FDs open FDs from each layer when directory entries are read,
+ // so they don't require opening an FD from d.topLayer() up front.
+ ftype := atomic.LoadUint32(&d.mode) & linux.S_IFMT
+ if ftype == linux.S_IFDIR {
+ // Can't open directories with O_CREAT.
+ if opts.Flags&linux.O_CREAT != 0 {
+ return nil, syserror.EISDIR
+ }
+ // Can't open directories writably.
+ if ats.MayWrite() {
+ return nil, syserror.EISDIR
+ }
+ if opts.Flags&linux.O_DIRECT != 0 {
+ return nil, syserror.EINVAL
+ }
+ fd := &directoryFD{}
+ fd.LockFD.Init(&d.locks)
+ if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
+ UseDentryMetadata: true,
+ }); err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+ }
+
+ layerVD, isUpper := d.topLayerInfo()
+ layerFD, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: layerVD,
+ Start: layerVD,
+ }, opts)
+ if err != nil {
+ return nil, err
+ }
+ if ftype != linux.S_IFREG {
+ return layerFD, nil
+ }
+ layerFlags := layerFD.StatusFlags()
+ fd := &regularFileFD{
+ copiedUp: isUpper,
+ cachedFD: layerFD,
+ cachedFlags: layerFlags,
+ }
+ fd.LockFD.Init(&d.locks)
+ layerFDOpts := layerFD.Options()
+ if err := fd.vfsfd.Init(fd, layerFlags, mnt, &d.vfsd, &layerFDOpts); err != nil {
+ layerFD.DecRef(ctx)
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
+
+// Preconditions:
+// * parent.dirMu must be locked.
+// * parent does not already contain a child named rp.Component().
+func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *dentry, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) {
+ creds := rp.Credentials()
+ if err := parent.checkPermissions(creds, vfs.MayWrite); err != nil {
+ return nil, err
+ }
+ if parent.vfsd.IsDead() {
+ return nil, syserror.ENOENT
+ }
+ mnt := rp.Mount()
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return nil, err
+ }
+ defer mnt.EndWrite()
+
+ if err := parent.copyUpLocked(ctx); err != nil {
+ return nil, err
+ }
+
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+ childName := rp.Component()
+ pop := vfs.PathOperation{
+ Root: parent.upperVD,
+ Start: parent.upperVD,
+ Path: fspath.Parse(childName),
+ }
+ // We don't know if a whiteout exists on the upper layer; speculatively
+ // unlink it.
+ //
+ // TODO(gvisor.dev/issue/1199): Modify OpenAt => stepLocked so that we do
+ // know whether a whiteout exists.
+ var haveUpperWhiteout bool
+ switch err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err {
+ case nil:
+ haveUpperWhiteout = true
+ case syserror.ENOENT:
+ haveUpperWhiteout = false
+ default:
+ return nil, err
+ }
+ // Create the file on the upper layer, and get an FD representing it.
+ upperFD, err := vfsObj.OpenAt(ctx, fs.creds, &pop, &vfs.OpenOptions{
+ Flags: opts.Flags&^vfs.FileCreationFlags | linux.O_CREAT | linux.O_EXCL,
+ Mode: opts.Mode,
+ })
+ if err != nil {
+ if haveUpperWhiteout {
+ fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+ }
+ return nil, err
+ }
+ // Change the file's owner to the caller. We can't use upperFD.SetStat()
+ // because it will pick up creds from ctx.
+ if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: linux.STATX_UID | linux.STATX_GID,
+ UID: uint32(creds.EffectiveKUID),
+ GID: uint32(creds.EffectiveKGID),
+ },
+ }); err != nil {
+ if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr))
+ } else if haveUpperWhiteout {
+ fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+ }
+ return nil, err
+ }
+ // Re-lookup to get a dentry representing the new file, which is needed for
+ // the returned FD.
+ child, err := fs.getChildLocked(ctx, parent, childName, ds)
+ if err != nil {
+ if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr))
+ } else if haveUpperWhiteout {
+ fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+ }
+ return nil, err
+ }
+ // Finally construct the overlay FD.
+ upperFlags := upperFD.StatusFlags()
+ fd := &regularFileFD{
+ copiedUp: true,
+ cachedFD: upperFD,
+ cachedFlags: upperFlags,
+ }
+ fd.LockFD.Init(&child.locks)
+ upperFDOpts := upperFD.Options()
+ if err := fd.vfsfd.Init(fd, upperFlags, mnt, &child.vfsd, &upperFDOpts); err != nil {
+ upperFD.DecRef(ctx)
+ // Don't bother with cleanup; the file was created successfully, we
+ // just can't open it anymore for some reason.
+ return nil, err
+ }
+ parent.watches.Notify(ctx, childName, linux.IN_CREATE, 0 /* cookie */, vfs.PathEvent, false /* unlinked */)
+ return &fd.vfsfd, nil
+}
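+
+// The create-and-open sequence above, in outline (an illustrative summary,
+// not part of the build; the names match the code above):
+//
+//	UnlinkAt(pop)                          // speculatively remove any whiteout
+//	upperFD := OpenAt(pop, O_CREAT|O_EXCL) // create the file on the upper layer
+//	SetStatAt(pop, {UID, GID})             // chown to the caller, using fs.creds
+//	child := getChildLocked(...)           // re-lookup to get the overlay dentry
+//	fd.vfsfd.Init(...)                     // wrap upperFD in an overlay FD
+//
+// Each failure after the speculative unlink either recreates the whiteout or,
+// where the upper layer can no longer be restored, panics rather than expose
+// an inconsistent overlay.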
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return "", err
+ }
+ layerVD := d.topLayer()
+ return fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: layerVD,
+ Start: layerVD,
+ })
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+ if opts.Flags != 0 {
+ return syserror.EINVAL
+ }
+
+ var ds *[]*dentry
+ fs.renameMu.Lock()
+ defer fs.renameMuUnlockAndCheckDrop(ctx, &ds)
+ newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds)
+ if err != nil {
+ return err
+ }
+ newName := rp.Component()
+ if newName == "." || newName == ".." {
+ return syserror.EBUSY
+ }
+ mnt := rp.Mount()
+ if mnt != oldParentVD.Mount() {
+ return syserror.EXDEV
+ }
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer mnt.EndWrite()
+
+ oldParent := oldParentVD.Dentry().Impl().(*dentry)
+ creds := rp.Credentials()
+ if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
+ return err
+ }
+ // We need a dentry representing the renamed file since, if it's a
+ // directory, we need to check for write permission on it.
+ oldParent.dirMu.Lock()
+ defer oldParent.dirMu.Unlock()
+ renamed, err := fs.getChildLocked(ctx, oldParent, oldName, &ds)
+ if err != nil {
+ return err
+ }
+ if err := vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&oldParent.mode)), auth.KUID(atomic.LoadUint32(&renamed.uid))); err != nil {
+ return err
+ }
+ if renamed.isDir() {
+ if renamed == newParent || genericIsAncestorDentry(renamed, newParent) {
+ return syserror.EINVAL
+ }
+ if oldParent != newParent {
+ if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil {
+ return err
+ }
+ }
+ } else {
+ if opts.MustBeDir || rp.MustBeDir() {
+ return syserror.ENOTDIR
+ }
+ }
+
+ if oldParent != newParent {
+ if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
+ return err
+ }
+ newParent.dirMu.Lock()
+ defer newParent.dirMu.Unlock()
+ }
+ if newParent.vfsd.IsDead() {
+ return syserror.ENOENT
+ }
+ replacedLayer, err := fs.lookupLayerLocked(ctx, newParent, newName)
+ if err != nil {
+ return err
+ }
+ var (
+ replaced *dentry
+ replacedVFSD *vfs.Dentry
+ whiteouts map[string]bool
+ )
+ if replacedLayer.existsInOverlay() {
+ replaced, err = fs.getChildLocked(ctx, newParent, newName, &ds)
+ if err != nil {
+ return err
+ }
+ replacedVFSD = &replaced.vfsd
+ if replaced.isDir() {
+ if !renamed.isDir() {
+ return syserror.EISDIR
+ }
+ if genericIsAncestorDentry(replaced, renamed) {
+ return syserror.ENOTEMPTY
+ }
+ replaced.dirMu.Lock()
+ defer replaced.dirMu.Unlock()
+ whiteouts, err = replaced.collectWhiteoutsForRmdirLocked(ctx)
+ if err != nil {
+ return err
+ }
+ } else {
+ if rp.MustBeDir() || renamed.isDir() {
+ return syserror.ENOTDIR
+ }
+ }
+ }
+
+ if oldParent == newParent && oldName == newName {
+ return nil
+ }
+
+ // renamed and oldParent need to be copied-up before they're renamed on the
+ // upper layer.
+ if err := renamed.copyUpLocked(ctx); err != nil {
+ return err
+ }
+ // If renamed is a directory, all of its descendants need to be copied-up
+ // before they're renamed on the upper layer.
+ if renamed.isDir() {
+ if err := renamed.copyUpDescendantsLocked(ctx, &ds); err != nil {
+ return err
+ }
+ }
+ // newParent must be copied-up before it can contain renamed on the upper
+ // layer.
+ if err := newParent.copyUpLocked(ctx); err != nil {
+ return err
+ }
+ // If replaced exists, it doesn't need to be copied-up, but we do need to
+ // serialize with copy-up. Holding renameMu for writing should be
+ // sufficient, but out of an abundance of caution...
+ if replaced != nil {
+ replaced.copyMu.RLock()
+ defer replaced.copyMu.RUnlock()
+ }
+
+ vfsObj := rp.VirtualFilesystem()
+ mntns := vfs.MountNamespaceFromContext(ctx)
+ defer mntns.DecRef(ctx)
+ if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
+ return err
+ }
+
+ newpop := vfs.PathOperation{
+ Root: newParent.upperVD,
+ Start: newParent.upperVD,
+ Path: fspath.Parse(newName),
+ }
+
+ needRecreateWhiteouts := false
+ cleanupRecreateWhiteouts := func() {
+ if !needRecreateWhiteouts {
+ return
+ }
+ for whiteoutName, whiteoutUpper := range whiteouts {
+ if !whiteoutUpper {
+ continue
+ }
+ if err := fs.createWhiteout(ctx, vfsObj, &vfs.PathOperation{
+ Root: replaced.upperVD,
+ Start: replaced.upperVD,
+ Path: fspath.Parse(whiteoutName),
+ }); err != nil && err != syserror.EEXIST {
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RenameAt failure: %v", err))
+ }
+ }
+ }
+ if renamed.isDir() {
+ if replacedLayer == lookupLayerUpper {
+ // Remove whiteouts from the directory being replaced.
+ needRecreateWhiteouts = true
+ for whiteoutName, whiteoutUpper := range whiteouts {
+ if !whiteoutUpper {
+ continue
+ }
+ if err := vfsObj.UnlinkAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: replaced.upperVD,
+ Start: replaced.upperVD,
+ Path: fspath.Parse(whiteoutName),
+ }); err != nil {
+ cleanupRecreateWhiteouts()
+ vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+ return err
+ }
+ }
+ } else if replacedLayer == lookupLayerUpperWhiteout {
+ // We need to explicitly remove the whiteout since otherwise rename
+ // on the upper layer will fail with ENOTDIR.
+ if err := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); err != nil {
+ vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+ return err
+ }
+ }
+ }
+
+ // Essentially no gVisor filesystem supports RENAME_WHITEOUT, so just do a
+ // regular rename and create the whiteout at the origin manually. Unlike
+ // RENAME_WHITEOUT, this isn't atomic with respect to other users of the
+ // upper filesystem, but this is already the case for virtually all other
+ // overlay filesystem operations too.
+ oldpop := vfs.PathOperation{
+ Root: oldParent.upperVD,
+ Start: oldParent.upperVD,
+ Path: fspath.Parse(oldName),
+ }
+	if err := vfsObj.RenameAt(ctx, fs.creds, &oldpop, &newpop, &opts); err != nil {
+ cleanupRecreateWhiteouts()
+ vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+ return err
+ }
+
+ // Below this point, the renamed dentry is now at newpop, and anything we
+ // replaced is gone forever. Commit the rename, update the overlay
+ // filesystem tree, and abandon attempts to recover from errors.
+ vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD)
+ delete(oldParent.children, oldName)
+ if replaced != nil {
+ ds = appendDentry(ds, replaced)
+ }
+ if oldParent != newParent {
+ newParent.dirents = nil
+ // This can't drop the last reference on oldParent because one is held
+ // by oldParentVD, so lock recursion is impossible.
+ oldParent.DecRef(ctx)
+ ds = appendDentry(ds, oldParent)
+ newParent.IncRef()
+ renamed.parent = newParent
+ }
+ renamed.name = newName
+ if newParent.children == nil {
+ newParent.children = make(map[string]*dentry)
+ }
+ newParent.children[newName] = renamed
+ oldParent.dirents = nil
+
+ if err := fs.createWhiteout(ctx, vfsObj, &oldpop); err != nil {
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout at origin after RenameAt: %v", err))
+ }
+ if renamed.isDir() {
+ if err := vfsObj.SetXattrAt(ctx, fs.creds, &newpop, &vfs.SetXattrOptions{
+ Name: _OVL_XATTR_OPAQUE,
+ Value: "y",
+ }); err != nil {
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to make renamed directory opaque: %v", err))
+ }
+ }
+
+ vfs.InotifyRename(ctx, &renamed.watches, &oldParent.watches, &newParent.watches, oldName, newName, renamed.isDir())
+ return nil
+}
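+
+// The RENAME_WHITEOUT emulation above, in outline (illustrative only; names
+// match the code above):
+//
+//	RenameAt(oldpop, newpop)         // plain rename on the upper layer
+//	createWhiteout(oldpop)           // hide any lower-layer file at the origin
+//	SetXattrAt(newpop, opaque = "y") // for directories, mask lower entries
+//
+// Because the whiteout creation is not atomic with the rename, a concurrent
+// overlay user could briefly see a lower-layer file reappear at the old name
+// before the whiteout is created; as noted above, this matches the (lack of)
+// atomicity of most other overlay operations.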
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ start := rp.Start().Impl().(*dentry)
+ parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+ if err != nil {
+ return err
+ }
+ if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+ return err
+ }
+ if err := rp.Mount().CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer rp.Mount().EndWrite()
+ name := rp.Component()
+ if name == "." {
+ return syserror.EINVAL
+ }
+ if name == ".." {
+ return syserror.ENOTEMPTY
+ }
+ vfsObj := rp.VirtualFilesystem()
+ mntns := vfs.MountNamespaceFromContext(ctx)
+ defer mntns.DecRef(ctx)
+ parent.dirMu.Lock()
+ defer parent.dirMu.Unlock()
+
+ // Ensure that parent is copied-up before potentially holding child.copyMu
+ // below.
+ if err := parent.copyUpLocked(ctx); err != nil {
+ return err
+ }
+
+ // Unlike UnlinkAt, we need a dentry representing the child directory being
+ // removed in order to verify that it's empty.
+ child, err := fs.getChildLocked(ctx, parent, name, &ds)
+ if err != nil {
+ return err
+ }
+ if !child.isDir() {
+ return syserror.ENOTDIR
+ }
+ if err := vfs.CheckDeleteSticky(rp.Credentials(), linux.FileMode(atomic.LoadUint32(&parent.mode)), auth.KUID(atomic.LoadUint32(&child.uid))); err != nil {
+ return err
+ }
+ child.dirMu.Lock()
+ defer child.dirMu.Unlock()
+ whiteouts, err := child.collectWhiteoutsForRmdirLocked(ctx)
+ if err != nil {
+ return err
+ }
+ child.copyMu.RLock()
+ defer child.copyMu.RUnlock()
+ if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
+ return err
+ }
+
+ pop := vfs.PathOperation{
+ Root: parent.upperVD,
+ Start: parent.upperVD,
+ Path: fspath.Parse(name),
+ }
+ if child.upperVD.Ok() {
+ cleanupRecreateWhiteouts := func() {
+ if !child.upperVD.Ok() {
+ return
+ }
+ for whiteoutName, whiteoutUpper := range whiteouts {
+ if !whiteoutUpper {
+ continue
+ }
+ if err := fs.createWhiteout(ctx, vfsObj, &vfs.PathOperation{
+ Root: child.upperVD,
+ Start: child.upperVD,
+ Path: fspath.Parse(whiteoutName),
+ }); err != nil && err != syserror.EEXIST {
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err))
+ }
+ }
+ }
+ // Remove existing whiteouts on the upper layer.
+ for whiteoutName, whiteoutUpper := range whiteouts {
+ if !whiteoutUpper {
+ continue
+ }
+ if err := vfsObj.UnlinkAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: child.upperVD,
+ Start: child.upperVD,
+ Path: fspath.Parse(whiteoutName),
+ }); err != nil {
+ cleanupRecreateWhiteouts()
+ vfsObj.AbortDeleteDentry(&child.vfsd)
+ return err
+ }
+ }
+ // Remove the existing directory on the upper layer.
+ if err := vfsObj.RmdirAt(ctx, fs.creds, &pop); err != nil {
+ cleanupRecreateWhiteouts()
+ vfsObj.AbortDeleteDentry(&child.vfsd)
+ return err
+ }
+ }
+ if err := fs.createWhiteout(ctx, vfsObj, &pop); err != nil {
+ // Don't attempt to recover from this: the original directory is
+ // already gone, so any dentries representing it are invalid, and
+ // creating a new directory won't undo that.
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout during RmdirAt: %v", err))
+ }
+
+ vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
+ delete(parent.children, name)
+ ds = appendDentry(ds, child)
+ parent.dirents = nil
+ parent.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0 /* cookie */, vfs.InodeEvent, true /* unlinked */)
+ return nil
+}
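+
+// The upper-layer directory removal above, in outline (illustrative only):
+//
+//	for each upper whiteout w in child: UnlinkAt(child/w) // empty the directory
+//	RmdirAt(pop)                                          // remove it
+//	createWhiteout(pop)                                   // hide any lower copies
+//
+// If a whiteout removal or the rmdir fails partway through, the already
+// deleted whiteouts are recreated so that the directory still reads as empty;
+// once the directory itself is gone, a failure to create the final whiteout
+// is unrecoverable and panics.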
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ return err
+ }
+ err = d.setStatLocked(ctx, rp, opts)
+ fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ if err != nil {
+ return err
+ }
+
+ if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+ d.InotifyWithParent(ctx, ev, 0 /* cookie */, vfs.InodeEvent)
+ }
+ return nil
+}
+
+// Precondition: d.fs.renameMu must be held for reading.
+func (d *dentry) setStatLocked(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+ mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+ if err := vfs.CheckSetStat(ctx, rp.Credentials(), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
+ return err
+ }
+ mnt := rp.Mount()
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer mnt.EndWrite()
+ if err := d.copyUpLocked(ctx); err != nil {
+ return err
+ }
+ // Changes to d's attributes are serialized by d.copyMu.
+ d.copyMu.Lock()
+ defer d.copyMu.Unlock()
+ if err := d.fs.vfsfs.VirtualFilesystem().SetStatAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.upperVD,
+ Start: d.upperVD,
+ }, &opts); err != nil {
+ return err
+ }
+ d.updateAfterSetStatLocked(&opts)
+ return nil
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return linux.Statx{}, err
+ }
+
+ var stat linux.Statx
+ if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 {
+ layerVD := d.topLayer()
+ stat, err = fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: layerVD,
+ Start: layerVD,
+ }, &vfs.StatOptions{
+ Mask: layerMask,
+ Sync: opts.Sync,
+ })
+ if err != nil {
+ return linux.Statx{}, err
+ }
+ }
+ d.statInternalTo(ctx, &opts, &stat)
+ return stat, nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ _, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return linux.Statfs{}, err
+ }
+ return fs.statFS(ctx)
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+ return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+ pop := vfs.PathOperation{
+ Root: parent.upperVD,
+ Start: parent.upperVD,
+ Path: fspath.Parse(childName),
+ }
+ if haveUpperWhiteout {
+ if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
+ return err
+ }
+ }
+ if err := vfsObj.SymlinkAt(ctx, fs.creds, &pop, target); err != nil {
+ if haveUpperWhiteout {
+ fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+ }
+ return err
+ }
+ creds := rp.Credentials()
+ if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: linux.STATX_UID | linux.STATX_GID,
+ UID: uint32(creds.EffectiveKUID),
+ GID: uint32(creds.EffectiveKGID),
+ },
+ }); err != nil {
+ if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr))
+ } else if haveUpperWhiteout {
+ fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+ }
+ return err
+ }
+ return nil
+ })
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ start := rp.Start().Impl().(*dentry)
+ parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+ if err != nil {
+ return err
+ }
+ if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+ return err
+ }
+ if err := rp.Mount().CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer rp.Mount().EndWrite()
+ name := rp.Component()
+ if name == "." || name == ".." {
+ return syserror.EISDIR
+ }
+ if rp.MustBeDir() {
+ return syserror.ENOTDIR
+ }
+ vfsObj := rp.VirtualFilesystem()
+ mntns := vfs.MountNamespaceFromContext(ctx)
+ defer mntns.DecRef(ctx)
+ parent.dirMu.Lock()
+ defer parent.dirMu.Unlock()
+
+ // Ensure that parent is copied-up before potentially holding child.copyMu
+ // below.
+ if err := parent.copyUpLocked(ctx); err != nil {
+ return err
+ }
+
+ parentMode := atomic.LoadUint32(&parent.mode)
+ child := parent.children[name]
+ var childLayer lookupLayer
+ if child == nil {
+ if parentMode&linux.S_ISVTX != 0 {
+ // If the parent's sticky bit is set, we need a child dentry to get
+ // its owner.
+ child, err = fs.getChildLocked(ctx, parent, name, &ds)
+ if err != nil {
+ return err
+ }
+ } else {
+ // Determine if the file being unlinked actually exists. Holding
+ // parent.dirMu prevents a dentry from being instantiated for the file,
+ // which in turn prevents it from being copied-up, so this result is
+ // stable.
+ childLayer, err = fs.lookupLayerLocked(ctx, parent, name)
+ if err != nil {
+ return err
+ }
+ if !childLayer.existsInOverlay() {
+ return syserror.ENOENT
+ }
+ }
+ }
+ if child != nil {
+ if child.isDir() {
+ return syserror.EISDIR
+ }
+ if err := vfs.CheckDeleteSticky(rp.Credentials(), linux.FileMode(parentMode), auth.KUID(atomic.LoadUint32(&child.uid))); err != nil {
+ return err
+ }
+ if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
+ return err
+ }
+ // Hold child.copyMu to prevent it from being copied-up during
+ // deletion.
+ child.copyMu.RLock()
+ defer child.copyMu.RUnlock()
+ if child.upperVD.Ok() {
+ childLayer = lookupLayerUpper
+ } else {
+ childLayer = lookupLayerLower
+ }
+ }
+
+ pop := vfs.PathOperation{
+ Root: parent.upperVD,
+ Start: parent.upperVD,
+ Path: fspath.Parse(name),
+ }
+ if childLayer == lookupLayerUpper {
+ // Remove the existing file on the upper layer.
+ if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
+ if child != nil {
+ vfsObj.AbortDeleteDentry(&child.vfsd)
+ }
+ return err
+ }
+ }
+ if err := fs.createWhiteout(ctx, vfsObj, &pop); err != nil {
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout during UnlinkAt: %v", err))
+ }
+
+ var cw *vfs.Watches
+ if child != nil {
+ vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
+ delete(parent.children, name)
+ ds = appendDentry(ds, child)
+ cw = &child.watches
+ }
+ vfs.InotifyRemoveChild(ctx, cw, &parent.watches, name)
+ parent.dirents = nil
+ return nil
+}
+
+// isOverlayXattr returns whether the given extended attribute configures the
+// overlay.
+func isOverlayXattr(name string) bool {
+ return strings.HasPrefix(name, _OVL_XATTR_PREFIX)
+}
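+
+// For example (assuming _OVL_XATTR_PREFIX expands to the Linux-compatible
+// prefix "trusted.overlay.", which the opaque attribute used above suggests):
+//
+//	isOverlayXattr("trusted.overlay.opaque") // true: reserved for the overlay
+//	isOverlayXattr("user.mydata")            // false: passed through to layers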
+
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return nil, err
+ }
+
+ return fs.listXattr(ctx, d, size)
+}
+
+func (fs *filesystem) listXattr(ctx context.Context, d *dentry, size uint64) ([]string, error) {
+ vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ top := d.topLayer()
+ names, err := vfsObj.ListXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, size)
+ if err != nil {
+ return nil, err
+ }
+
+ // Filter out all overlay attributes.
+ n := 0
+ for _, name := range names {
+ if !isOverlayXattr(name) {
+ names[n] = name
+ n++
+ }
+ }
+	return names[:n], nil
+}
+
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return "", err
+ }
+
+ return fs.getXattr(ctx, d, rp.Credentials(), &opts)
+}
+
+func (fs *filesystem) getXattr(ctx context.Context, d *dentry, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
+ if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
+ return "", err
+ }
+
+ // Return EOPNOTSUPP when fetching an overlay attribute.
+ // See fs/overlayfs/super.c:ovl_own_xattr_get().
+ if isOverlayXattr(opts.Name) {
+ return "", syserror.EOPNOTSUPP
+ }
+
+ // Analogous to fs/overlayfs/super.c:ovl_other_xattr_get().
+ vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ top := d.topLayer()
+ return vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, opts)
+}
+
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ return err
+ }
+
+ err = fs.setXattrLocked(ctx, d, rp.Mount(), rp.Credentials(), &opts)
+ fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ if err != nil {
+ return err
+ }
+
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent)
+ return nil
+}
+
+// Precondition: fs.renameMu must be locked.
+func (fs *filesystem) setXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
+ if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
+ return err
+ }
+
+ // Return EOPNOTSUPP when setting an overlay attribute.
+ // See fs/overlayfs/super.c:ovl_own_xattr_set().
+ if isOverlayXattr(opts.Name) {
+ return syserror.EOPNOTSUPP
+ }
+
+ // Analogous to fs/overlayfs/super.c:ovl_other_xattr_set().
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer mnt.EndWrite()
+ if err := d.copyUpLocked(ctx); err != nil {
+ return err
+ }
+ vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ return vfsObj.SetXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, opts)
+}
+
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ return err
+ }
+
+ err = fs.removeXattrLocked(ctx, d, rp.Mount(), rp.Credentials(), name)
+ fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ if err != nil {
+ return err
+ }
+
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent)
+ return nil
+}
+
+// Precondition: fs.renameMu must be locked.
+func (fs *filesystem) removeXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, name string) error {
+ if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
+ return err
+ }
+
+ // Like SetXattrAt, return EOPNOTSUPP when removing an overlay attribute.
+ // Linux passes the remove request to xattr_handler->set.
+ // See fs/xattr.c:vfs_removexattr().
+ if isOverlayXattr(name) {
+ return syserror.EOPNOTSUPP
+ }
+
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer mnt.EndWrite()
+ if err := d.copyUpLocked(ctx); err != nil {
+ return err
+ }
+ vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ return vfsObj.RemoveXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, name)
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+ fs.renameMu.RLock()
+ defer fs.renameMu.RUnlock()
+ return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b)
+}
diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go
new file mode 100644
index 000000000..c812f0a70
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/overlay.go
@@ -0,0 +1,798 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package overlay provides an overlay filesystem implementation, which
+// synthesizes a filesystem by composing one or more immutable filesystems
+// ("lower layers") with an optional mutable filesystem ("upper layer").
+//
+// Lock order:
+//
+// directoryFD.mu / regularFileFD.mu
+// filesystem.renameMu
+// dentry.dirMu
+// dentry.copyMu
+// filesystem.devMu
+// *** "memmap.Mappable locks" below this point
+// dentry.mapsMu
+// *** "memmap.Mappable locks taken by Translate" below this point
+// dentry.dataMu
+//
+// Locking dentry.dirMu in multiple dentries requires that parent dentries are
+// locked before child dentries, and that filesystem.renameMu is locked to
+// stabilize this relationship.
+package overlay
+
+import (
+ "fmt"
+ "strings"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/refsvfs2"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Name is the default filesystem name.
+const Name = "overlay"
+
+// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
+type FilesystemType struct{}
+
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+ return Name
+}
+
+// Release implements FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
+// FilesystemOptions may be passed as vfs.GetFilesystemOptions.InternalData to
+// FilesystemType.GetFilesystem.
+//
+// +stateify savable
+type FilesystemOptions struct {
+ // Callers passing FilesystemOptions to
+ // overlay.FilesystemType.GetFilesystem() are responsible for ensuring that
+ // the vfs.Mounts comprising the layers of the overlay filesystem do not
+ // contain submounts.
+
+ // If UpperRoot.Ok(), it is the root of the writable upper layer of the
+ // overlay.
+ UpperRoot vfs.VirtualDentry
+
+ // LowerRoots contains the roots of the immutable lower layers of the
+ // overlay. LowerRoots is immutable.
+ LowerRoots []vfs.VirtualDentry
+}
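+
+// A minimal sketch of constructing FilesystemOptions directly (illustrative;
+// upperRoot and lowerRoot are assumed to be vfs.VirtualDentry values that the
+// caller has already resolved, e.g. via VirtualFilesystem.GetDentryAt):
+//
+//	opts := vfs.GetFilesystemOptions{
+//		InternalData: FilesystemOptions{
+//			UpperRoot:  upperRoot,
+//			LowerRoots: []vfs.VirtualDentry{lowerRoot},
+//		},
+//	}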
+
+// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
+type filesystem struct {
+ vfsfs vfs.Filesystem
+
+ // Immutable options.
+ opts FilesystemOptions
+
+ // creds is a copy of the filesystem's creator's credentials, which are
+ // used for accesses to the filesystem's layers. creds is immutable.
+ creds *auth.Credentials
+
+ // dirDevMinor is the device minor number used for directories. dirDevMinor
+ // is immutable.
+ dirDevMinor uint32
+
+	// lowerDevMinors maps device numbers from lower layer filesystems to
+	// device minor numbers assigned to non-directory files originating from
+	// that filesystem. (This remapping is necessary for lower layers because
+	// a file on a lower layer and that same file on the overlay must remain
+	// distinguishable, as they will diverge after copy-up; this isn't true
+	// for non-directory files already on the upper layer.) lowerDevMinors is
+	// protected by devMu.
+ devMu sync.Mutex `state:"nosave"`
+ lowerDevMinors map[layerDevNumber]uint32
+
+ // renameMu synchronizes renaming with non-renaming operations in order to
+ // ensure consistent lock ordering between dentry.dirMu in different
+ // dentries.
+ renameMu sync.RWMutex `state:"nosave"`
+
+ // lastDirIno is the last inode number assigned to a directory. lastDirIno
+ // is accessed using atomic memory operations.
+ lastDirIno uint64
+}
+
+// +stateify savable
+type layerDevNumber struct {
+ major uint32
+ minor uint32
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+ mopts := vfs.GenericParseMountOptions(opts.Data)
+ fsoptsRaw := opts.InternalData
+ fsopts, ok := fsoptsRaw.(FilesystemOptions)
+ if fsoptsRaw != nil && !ok {
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw)
+ return nil, nil, syserror.EINVAL
+ }
+ vfsroot := vfs.RootFromContext(ctx)
+ if vfsroot.Ok() {
+ defer vfsroot.DecRef(ctx)
+ }
+
+ if upperPathname, ok := mopts["upperdir"]; ok {
+ if fsopts.UpperRoot.Ok() {
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: both upperdir and FilesystemOptions.UpperRoot are specified")
+ return nil, nil, syserror.EINVAL
+ }
+ delete(mopts, "upperdir")
+ // Linux overlayfs also requires a workdir when upperdir is
+ // specified; we don't, so silently ignore this option.
+ delete(mopts, "workdir")
+ upperPath := fspath.Parse(upperPathname)
+ if !upperPath.Absolute {
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname)
+ return nil, nil, syserror.EINVAL
+ }
+ upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
+ Root: vfsroot,
+ Start: vfsroot,
+ Path: upperPath,
+ FollowFinalSymlink: true,
+ }, &vfs.GetDentryOptions{
+ CheckSearchable: true,
+ })
+ if err != nil {
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err)
+ return nil, nil, err
+ }
+ privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */)
+ upperRoot.DecRef(ctx)
+ if err != nil {
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err)
+ return nil, nil, err
+ }
+ defer privateUpperRoot.DecRef(ctx)
+ fsopts.UpperRoot = privateUpperRoot
+ }
+
+ if lowerPathnamesStr, ok := mopts["lowerdir"]; ok {
+ if len(fsopts.LowerRoots) != 0 {
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: both lowerdir and FilesystemOptions.LowerRoots are specified")
+ return nil, nil, syserror.EINVAL
+ }
+ delete(mopts, "lowerdir")
+ lowerPathnames := strings.Split(lowerPathnamesStr, ":")
+ for _, lowerPathname := range lowerPathnames {
+ lowerPath := fspath.Parse(lowerPathname)
+ if !lowerPath.Absolute {
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname)
+ return nil, nil, syserror.EINVAL
+ }
+ lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
+ Root: vfsroot,
+ Start: vfsroot,
+ Path: lowerPath,
+ FollowFinalSymlink: true,
+ }, &vfs.GetDentryOptions{
+ CheckSearchable: true,
+ })
+ if err != nil {
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err)
+ return nil, nil, err
+ }
+ privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */)
+ lowerRoot.DecRef(ctx)
+ if err != nil {
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err)
+ return nil, nil, err
+ }
+ defer privateLowerRoot.DecRef(ctx)
+ fsopts.LowerRoots = append(fsopts.LowerRoots, privateLowerRoot)
+ }
+ }
+
+ if len(mopts) != 0 {
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts)
+ return nil, nil, syserror.EINVAL
+ }
+
+ if len(fsopts.LowerRoots) == 0 {
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: at least one lower layer is required")
+ return nil, nil, syserror.EINVAL
+ }
+ if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() {
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two lower layers are required when no upper layer is present")
+ return nil, nil, syserror.EINVAL
+ }
+	const maxLowerLayers = 500 // Linux: fs/overlayfs/super.c:OVL_MAX_STACK
+ if len(fsopts.LowerRoots) > maxLowerLayers {
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: %d lower layers specified, maximum %d", len(fsopts.LowerRoots), maxLowerLayers)
+ return nil, nil, syserror.EINVAL
+ }
+
+ // Allocate dirDevMinor. lowerDevMinors are allocated dynamically.
+ dirDevMinor, err := vfsObj.GetAnonBlockDevMinor()
+ if err != nil {
+ return nil, nil, err
+ }
+
+ // Take extra references held by the filesystem.
+ if fsopts.UpperRoot.Ok() {
+ fsopts.UpperRoot.IncRef()
+ }
+ for _, lowerRoot := range fsopts.LowerRoots {
+ lowerRoot.IncRef()
+ }
+
+ fs := &filesystem{
+ opts: fsopts,
+ creds: creds.Fork(),
+ dirDevMinor: dirDevMinor,
+ lowerDevMinors: make(map[layerDevNumber]uint32),
+ }
+ fs.vfsfs.Init(vfsObj, &fstype, fs)
+
+ // Construct the root dentry.
+ root := fs.newDentry()
+ root.refs = 1
+ if fs.opts.UpperRoot.Ok() {
+ fs.opts.UpperRoot.IncRef()
+ root.copiedUp = 1
+ root.upperVD = fs.opts.UpperRoot
+ }
+ for _, lowerRoot := range fs.opts.LowerRoots {
+ lowerRoot.IncRef()
+ root.lowerVDs = append(root.lowerVDs, lowerRoot)
+ }
+ rootTopVD := root.topLayer()
+ // Get metadata from the topmost layer. See fs.lookupLocked().
+ const rootStatMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
+ rootStat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+ Root: rootTopVD,
+ Start: rootTopVD,
+ }, &vfs.StatOptions{
+ Mask: rootStatMask,
+ })
+ if err != nil {
+ root.destroyLocked(ctx)
+ fs.vfsfs.DecRef(ctx)
+ return nil, nil, err
+ }
+ if rootStat.Mask&rootStatMask != rootStatMask {
+ root.destroyLocked(ctx)
+ fs.vfsfs.DecRef(ctx)
+ return nil, nil, syserror.EREMOTE
+ }
+ if isWhiteout(&rootStat) {
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout")
+ root.destroyLocked(ctx)
+ fs.vfsfs.DecRef(ctx)
+ return nil, nil, syserror.EINVAL
+ }
+ root.mode = uint32(rootStat.Mode)
+ root.uid = rootStat.UID
+ root.gid = rootStat.GID
+ if rootStat.Mode&linux.S_IFMT == linux.S_IFDIR {
+ root.devMajor = linux.UNNAMED_MAJOR
+ root.devMinor = fs.dirDevMinor
+ root.ino = fs.newDirIno()
+ } else if !root.upperVD.Ok() {
+ root.devMajor = linux.UNNAMED_MAJOR
+ rootDevMinor, err := fs.getLowerDevMinor(rootStat.DevMajor, rootStat.DevMinor)
+ if err != nil {
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to get device number for root: %v", err)
+ root.destroyLocked(ctx)
+ fs.vfsfs.DecRef(ctx)
+ return nil, nil, err
+ }
+ root.devMinor = rootDevMinor
+ root.ino = rootStat.Ino
+ } else {
+ root.devMajor = rootStat.DevMajor
+ root.devMinor = rootStat.DevMinor
+ root.ino = rootStat.Ino
+ }
+
+ return &fs.vfsfs, &root.vfsd, nil
+}
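+
+// The same layers can instead be described with Linux-style mount data, as
+// parsed above (a hedged sketch; the paths are hypothetical):
+//
+//	opts := vfs.GetFilesystemOptions{
+//		Data: "lowerdir=/lower1:/lower2,upperdir=/upper,workdir=/work",
+//	}
+//
+// Unlike Linux, workdir is accepted but silently ignored, and each path is
+// resolved and converted to a private bind mount via clonePrivateMount.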
+
+// clonePrivateMount creates a non-recursive bind mount rooted at vd, not
+// associated with any MountNamespace, and returns the root of the new mount.
+// (This is required to ensure that each layer of an overlay comprises only a
+// single mount, and therefore can't cross into e.g. the overlay filesystem
+// itself, risking lock recursion.) A reference is held on the returned
+// VirtualDentry.
+func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forceReadOnly bool) (vfs.VirtualDentry, error) {
+ oldmnt := vd.Mount()
+ opts := oldmnt.Options()
+ if forceReadOnly {
+ opts.ReadOnly = true
+ }
+ newmnt, err := vfsObj.NewDisconnectedMount(oldmnt.Filesystem(), vd.Dentry(), &opts)
+ if err != nil {
+ return vfs.VirtualDentry{}, err
+ }
+ // Take a reference on the dentry which will be owned by the returned
+ // VirtualDentry.
+ d := vd.Dentry()
+ d.IncRef()
+ return vfs.MakeVirtualDentry(newmnt, d), nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release(ctx context.Context) {
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+ vfsObj.PutAnonBlockDevMinor(fs.dirDevMinor)
+ for _, lowerDevMinor := range fs.lowerDevMinors {
+ vfsObj.PutAnonBlockDevMinor(lowerDevMinor)
+ }
+ if fs.opts.UpperRoot.Ok() {
+ fs.opts.UpperRoot.DecRef(ctx)
+ }
+ for _, lowerRoot := range fs.opts.LowerRoots {
+ lowerRoot.DecRef(ctx)
+ }
+}
+
+func (fs *filesystem) statFS(ctx context.Context) (linux.Statfs, error) {
+ // Always statfs the root of the topmost layer. Compare Linux's
+ // fs/overlayfs/super.c:ovl_statfs().
+ var rootVD vfs.VirtualDentry
+ if fs.opts.UpperRoot.Ok() {
+ rootVD = fs.opts.UpperRoot
+ } else {
+ rootVD = fs.opts.LowerRoots[0]
+ }
+ fsstat, err := fs.vfsfs.VirtualFilesystem().StatFSAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: rootVD,
+ Start: rootVD,
+ })
+ if err != nil {
+ return linux.Statfs{}, err
+ }
+ fsstat.Type = linux.OVERLAYFS_SUPER_MAGIC
+ return fsstat, nil
+}
+
+func (fs *filesystem) newDirIno() uint64 {
+ return atomic.AddUint64(&fs.lastDirIno, 1)
+}
+
+func (fs *filesystem) getLowerDevMinor(layerMajor, layerMinor uint32) (uint32, error) {
+ fs.devMu.Lock()
+ defer fs.devMu.Unlock()
+ orig := layerDevNumber{layerMajor, layerMinor}
+ if minor, ok := fs.lowerDevMinors[orig]; ok {
+ return minor, nil
+ }
+ minor, err := fs.vfsfs.VirtualFilesystem().GetAnonBlockDevMinor()
+ if err != nil {
+ return 0, err
+ }
+ fs.lowerDevMinors[orig] = minor
+ return minor, nil
+}
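+
+// For example (illustrative): files from the same backing filesystem share
+// one stable anonymous minor, while a different backing filesystem gets its
+// own:
+//
+//	m1, _ := fs.getLowerDevMinor(8, 1)
+//	m2, _ := fs.getLowerDevMinor(8, 1) // m2 == m1 (cached)
+//	m3, _ := fs.getLowerDevMinor(8, 2) // m3 != m1 (fresh allocation)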
+
+// dentry implements vfs.DentryImpl.
+//
+// +stateify savable
+type dentry struct {
+ vfsd vfs.Dentry
+
+ refs int64
+
+ // fs is the owning filesystem. fs is immutable.
+ fs *filesystem
+
+ // mode, uid, and gid are the file mode, owner, and group of the file in
+ // the topmost layer (and therefore the overlay file as well), and are used
+ // for permission checks on this dentry. These fields are protected by
+ // copyMu and accessed using atomic memory operations.
+ mode uint32
+ uid uint32
+ gid uint32
+
+ // copiedUp is 1 if this dentry has been copied-up (i.e. upperVD.Ok()) and
+ // 0 otherwise. copiedUp is accessed using atomic memory operations.
+ copiedUp uint32
+
+ // parent is the dentry corresponding to this dentry's parent directory.
+ // name is this dentry's name in parent. If this dentry is a filesystem
+ // root, parent is nil and name is the empty string. parent and name are
+ // protected by fs.renameMu.
+ parent *dentry
+ name string
+
+ // If this dentry represents a directory, children maps the names of
+ // children for which dentries have been instantiated to those dentries,
+ // and dirents (if not nil) is a cache of dirents as returned by
+ // directoryFDs representing this directory. children is protected by
+ // dirMu.
+ dirMu sync.Mutex `state:"nosave"`
+ children map[string]*dentry
+ dirents []vfs.Dirent
+
+ // upperVD and lowerVDs are the files from the overlay filesystem's layers
+ // that comprise the file on the overlay filesystem.
+ //
+ // If !upperVD.Ok(), it can transition to a valid vfs.VirtualDentry (i.e.
+ // be copied up) with copyMu locked for writing; otherwise, it is
+ // immutable. lowerVDs is always immutable.
+ copyMu sync.RWMutex `state:"nosave"`
+ upperVD vfs.VirtualDentry
+ lowerVDs []vfs.VirtualDentry
+
+ // inlineLowerVDs backs lowerVDs in the common case where len(lowerVDs) <=
+ // len(inlineLowerVDs).
+ inlineLowerVDs [1]vfs.VirtualDentry
+
+ // devMajor, devMinor, and ino are the device major/minor and inode numbers
+ // used by this dentry. These fields are protected by copyMu and accessed
+ // using atomic memory operations.
+ devMajor uint32
+ devMinor uint32
+ ino uint64
+
+ // If this dentry represents a regular file, then:
+ //
+ // - mapsMu is used to synchronize between copy-up and memmap.Mappable
+ // methods on dentry preceding mm.MemoryManager.activeMu in the lock order.
+ //
+ // - dataMu is used to synchronize between copy-up and
+ // dentry.(memmap.Mappable).Translate.
+ //
+ // - lowerMappings tracks memory mappings of the file. lowerMappings is
+ // used to invalidate mappings of the lower layer when the file is copied
+ // up to ensure that they remain coherent with subsequent writes to the
+ // file. (Note that, as of this writing, Linux overlayfs does not do this;
+ // this feature is a gVisor extension.) lowerMappings is protected by
+ // mapsMu.
+ //
+ // - If this dentry is copied-up, then wrappedMappable is the Mappable
+ // obtained from a call to the current top layer's
+ // FileDescription.ConfigureMMap(). Once wrappedMappable becomes non-nil
+ // (from a call to regularFileFD.ensureMappable()), it cannot become nil.
+ // wrappedMappable is protected by mapsMu and dataMu.
+ //
+ // - isMappable is non-zero iff wrappedMappable is non-nil. isMappable is
+ // accessed using atomic memory operations.
+ mapsMu sync.Mutex `state:"nosave"`
+ lowerMappings memmap.MappingSet
+ dataMu sync.RWMutex `state:"nosave"`
+ wrappedMappable memmap.Mappable
+ isMappable uint32
+
+ locks vfs.FileLocks
+
+	// watches is the set of inotify watches on the file represented by this dentry.
+ //
+ // Note that hard links to the same file will not share the same set of
+ // watches, due to the fact that we do not have inode structures in this
+ // overlay implementation.
+ watches vfs.Watches
+}
+
+// newDentry creates a new dentry. The dentry initially has no references; it
+// is the caller's responsibility to set the dentry's reference count and/or
+// call dentry.destroyLocked() as appropriate. The dentry is initially invalid
+// in that it contains no layers; the caller is responsible for setting them.
+func (fs *filesystem) newDentry() *dentry {
+ d := &dentry{
+ fs: fs,
+ }
+ d.lowerVDs = d.inlineLowerVDs[:0]
+ d.vfsd.Init(d)
+ if refsvfs2.LeakCheckEnabled() {
+ refsvfs2.Register(d, "overlay.dentry")
+ }
+ return d
+}
+
+// IncRef implements vfs.DentryImpl.IncRef.
+func (d *dentry) IncRef() {
+ // d.refs may be 0 if d.fs.renameMu is locked, which serializes against
+ // d.checkDropLocked().
+ atomic.AddInt64(&d.refs, 1)
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef.
+func (d *dentry) TryIncRef() bool {
+ for {
+ refs := atomic.LoadInt64(&d.refs)
+ if refs <= 0 {
+ return false
+ }
+ if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) {
+ return true
+ }
+ }
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+func (d *dentry) DecRef(ctx context.Context) {
+ if refs := atomic.AddInt64(&d.refs, -1); refs == 0 {
+ d.fs.renameMu.Lock()
+ d.checkDropLocked(ctx)
+ d.fs.renameMu.Unlock()
+ } else if refs < 0 {
+ panic("overlay.dentry.DecRef() called without holding a reference")
+ }
+}
+
+// checkDropLocked should be called after d's reference count becomes 0 or it
+// becomes deleted.
+//
+// Preconditions: d.fs.renameMu must be locked for writing.
+func (d *dentry) checkDropLocked(ctx context.Context) {
+ // Dentries with a positive reference count must be retained. (The only way
+ // to obtain a reference on a dentry with zero references is via path
+ // resolution, which requires renameMu, so if d.refs is zero then it will
+ // remain zero while we hold renameMu for writing.) Dentries with a
+ // negative reference count have already been destroyed.
+ if atomic.LoadInt64(&d.refs) != 0 {
+ return
+ }
+
+ // Make sure that we do not lose watches on dentries that have not been
+ // deleted. Note that overlayfs never calls VFS.InvalidateDentry(), so
+ // d.vfsd.IsDead() indicates that d was deleted.
+ if !d.vfsd.IsDead() && d.watches.Size() > 0 {
+ return
+ }
+
+	// The reference count is still zero; destroy the dentry.
+	d.destroyLocked(ctx)
+}
+
+// destroyLocked destroys the dentry.
+//
+// Preconditions:
+// * d.fs.renameMu must be locked for writing.
+// * d.refs == 0.
+func (d *dentry) destroyLocked(ctx context.Context) {
+ switch atomic.LoadInt64(&d.refs) {
+ case 0:
+ // Mark the dentry destroyed.
+ atomic.StoreInt64(&d.refs, -1)
+ case -1:
+ panic("overlay.dentry.destroyLocked() called on already destroyed dentry")
+ default:
+ panic("overlay.dentry.destroyLocked() called with references on the dentry")
+ }
+
+ if d.upperVD.Ok() {
+ d.upperVD.DecRef(ctx)
+ }
+ for _, lowerVD := range d.lowerVDs {
+ lowerVD.DecRef(ctx)
+ }
+
+ d.watches.HandleDeletion(ctx)
+
+ if d.parent != nil {
+ d.parent.dirMu.Lock()
+ if !d.vfsd.IsDead() {
+ delete(d.parent.children, d.name)
+ }
+ d.parent.dirMu.Unlock()
+ // Drop the reference held by d on its parent without recursively
+ // locking d.fs.renameMu.
+ if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 {
+ d.parent.checkDropLocked(ctx)
+ } else if refs < 0 {
+ panic("overlay.dentry.DecRef() called without holding a reference")
+ }
+ }
+ if refsvfs2.LeakCheckEnabled() {
+ refsvfs2.Unregister(d, "overlay.dentry")
+ }
+}
+
+// LeakMessage implements refsvfs2.CheckedObject.LeakMessage.
+func (d *dentry) LeakMessage() string {
+ return fmt.Sprintf("[overlay.dentry %p] reference count of %d instead of -1", d, atomic.LoadInt64(&d.refs))
+}
+
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+func (d *dentry) InotifyWithParent(ctx context.Context, events uint32, cookie uint32, et vfs.EventType) {
+ if d.isDir() {
+ events |= linux.IN_ISDIR
+ }
+
+ // overlayfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates
+ // that d was deleted.
+ deleted := d.vfsd.IsDead()
+
+ d.fs.renameMu.RLock()
+	// The ordering below is important: Linux always notifies the parent first.
+ if d.parent != nil {
+ d.parent.watches.Notify(ctx, d.name, events, cookie, et, deleted)
+ }
+ d.watches.Notify(ctx, "", events, cookie, et, deleted)
+ d.fs.renameMu.RUnlock()
+}
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *dentry) Watches() *vfs.Watches {
+ return &d.watches
+}
+
+// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
+func (d *dentry) OnZeroWatches(ctx context.Context) {
+ if atomic.LoadInt64(&d.refs) == 0 {
+ d.fs.renameMu.Lock()
+ d.checkDropLocked(ctx)
+ d.fs.renameMu.Unlock()
+ }
+}
+
+// iterLayers invokes yield on each layer comprising d, from top to bottom. If
+// any call to yield returns false, iterLayers stops iteration.
+func (d *dentry) iterLayers(yield func(vd vfs.VirtualDentry, isUpper bool) bool) {
+ if d.isCopiedUp() {
+ if !yield(d.upperVD, true) {
+ return
+ }
+ }
+ for _, lowerVD := range d.lowerVDs {
+ if !yield(lowerVD, false) {
+ return
+ }
+ }
+}
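+
+// A typical use of iterLayers (an illustrative sketch; layerContains is a
+// hypothetical helper, not defined in this package):
+//
+//	var found vfs.VirtualDentry
+//	d.iterLayers(func(vd vfs.VirtualDentry, isUpper bool) bool {
+//		if layerContains(vd, name) {
+//			found = vd
+//			return false // stop at the topmost layer containing name
+//		}
+//		return true // keep descending through the layers
+//	})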
+
+func (d *dentry) topLayerInfo() (vd vfs.VirtualDentry, isUpper bool) {
+ if d.isCopiedUp() {
+ return d.upperVD, true
+ }
+ return d.lowerVDs[0], false
+}
+
+func (d *dentry) topLayer() vfs.VirtualDentry {
+ vd, _ := d.topLayerInfo()
+ return vd
+}
+
+func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+ return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
+}
+
+func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
+ mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+ kuid := auth.KUID(atomic.LoadUint32(&d.uid))
+ kgid := auth.KGID(atomic.LoadUint32(&d.gid))
+ if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
+ return err
+ }
+ return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
+}
+
+// statInternalMask is the set of stat fields that is set by
+// dentry.statInternalTo().
+const statInternalMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
+
+// statInternalTo writes fields to stat that are stored in d, and therefore do
+// not require invoking StatAt on the overlay's layers.
+func (d *dentry) statInternalTo(ctx context.Context, opts *vfs.StatOptions, stat *linux.Statx) {
+ stat.Mask |= statInternalMask
+ if d.isDir() {
+ // Linux sets nlink to 1 for merged directories
+ // (fs/overlayfs/inode.c:ovl_getattr()); we set it to 2 because this is
+ // correct more often ("." and the directory's entry in its parent),
+ // and some of our tests expect this.
+ stat.Nlink = 2
+ }
+ stat.UID = atomic.LoadUint32(&d.uid)
+ stat.GID = atomic.LoadUint32(&d.gid)
+ stat.Mode = uint16(atomic.LoadUint32(&d.mode))
+ stat.Ino = atomic.LoadUint64(&d.ino)
+ stat.DevMajor = atomic.LoadUint32(&d.devMajor)
+ stat.DevMinor = atomic.LoadUint32(&d.devMinor)
+}
+
+// Preconditions: d.copyMu must be locked for writing.
+func (d *dentry) updateAfterSetStatLocked(opts *vfs.SetStatOptions) {
+ if opts.Stat.Mask&linux.STATX_MODE != 0 {
+ atomic.StoreUint32(&d.mode, (d.mode&linux.S_IFMT)|uint32(opts.Stat.Mode&^linux.S_IFMT))
+ }
+ if opts.Stat.Mask&linux.STATX_UID != 0 {
+ atomic.StoreUint32(&d.uid, opts.Stat.UID)
+ }
+ if opts.Stat.Mask&linux.STATX_GID != 0 {
+ atomic.StoreUint32(&d.gid, opts.Stat.GID)
+ }
+}
+
+// fileDescription is embedded by overlay implementations of
+// vfs.FileDescriptionImpl.
+//
+// +stateify savable
+type fileDescription struct {
+ vfsfd vfs.FileDescription
+ vfs.FileDescriptionDefaultImpl
+ vfs.LockFD
+}
+
+func (fd *fileDescription) filesystem() *filesystem {
+ return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
+}
+
+func (fd *fileDescription) dentry() *dentry {
+ return fd.vfsfd.Dentry().Impl().(*dentry)
+}
+
+// ListXattr implements vfs.FileDescriptionImpl.ListXattr.
+func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
+ return fd.filesystem().listXattr(ctx, fd.dentry(), size)
+}
+
+// GetXattr implements vfs.FileDescriptionImpl.GetXattr.
+func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
+ return fd.filesystem().getXattr(ctx, fd.dentry(), auth.CredentialsFromContext(ctx), &opts)
+}
+
+// SetXattr implements vfs.FileDescriptionImpl.SetXattr.
+func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
+ fs := fd.filesystem()
+ d := fd.dentry()
+
+ fs.renameMu.RLock()
+ err := fs.setXattrLocked(ctx, d, fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), &opts)
+ fs.renameMu.RUnlock()
+ if err != nil {
+ return err
+ }
+
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ return nil
+}
+
+// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
+func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
+ fs := fd.filesystem()
+ d := fd.dentry()
+
+ fs.renameMu.RLock()
+ err := fs.removeXattrLocked(ctx, d, fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), name)
+ fs.renameMu.RUnlock()
+ if err != nil {
+ return err
+ }
+
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ return nil
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/overlay/regular_file.go b/pkg/sentry/fsimpl/overlay/regular_file.go
new file mode 100644
index 000000000..2b89a7a6d
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/regular_file.go
@@ -0,0 +1,456 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+func (d *dentry) isRegularFile() bool {
+ return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFREG
+}
+
+func (d *dentry) isSymlink() bool {
+ return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK
+}
+
+func (d *dentry) readlink(ctx context.Context) (string, error) {
+ layerVD := d.topLayer()
+ return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: layerVD,
+ Start: layerVD,
+ })
+}
+
+// +stateify savable
+type regularFileFD struct {
+ fileDescription
+
+ // If copiedUp is false, cachedFD represents
+ // fileDescription.dentry().lowerVDs[0]; otherwise, cachedFD represents
+ // fileDescription.dentry().upperVD. cachedFlags is the last known value of
+ // cachedFD.StatusFlags(). copiedUp, cachedFD, and cachedFlags are
+ // protected by mu.
+ mu sync.Mutex `state:"nosave"`
+ copiedUp bool
+ cachedFD *vfs.FileDescription
+ cachedFlags uint32
+
+ // If copiedUp is false, lowerWaiters contains all waiter.Entries
+ // registered with cachedFD. lowerWaiters is protected by mu.
+ lowerWaiters map[*waiter.Entry]waiter.EventMask
+}
+
+func (fd *regularFileFD) getCurrentFD(ctx context.Context) (*vfs.FileDescription, error) {
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ wrappedFD, err := fd.currentFDLocked(ctx)
+ if err != nil {
+ return nil, err
+ }
+ wrappedFD.IncRef()
+ return wrappedFD, nil
+}
+
+func (fd *regularFileFD) currentFDLocked(ctx context.Context) (*vfs.FileDescription, error) {
+ d := fd.dentry()
+ statusFlags := fd.vfsfd.StatusFlags()
+ if !fd.copiedUp && d.isCopiedUp() {
+ // Switch to the copied-up file.
+ upperVD := d.topLayer()
+ upperFD, err := fd.filesystem().vfsfs.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: upperVD,
+ Start: upperVD,
+ }, &vfs.OpenOptions{
+ Flags: statusFlags,
+ })
+ if err != nil {
+ return nil, err
+ }
+ oldOff, oldOffErr := fd.cachedFD.Seek(ctx, 0, linux.SEEK_CUR)
+ if oldOffErr == nil {
+ if _, err := upperFD.Seek(ctx, oldOff, linux.SEEK_SET); err != nil {
+ upperFD.DecRef(ctx)
+ return nil, err
+ }
+ }
+ if len(fd.lowerWaiters) != 0 {
+ ready := upperFD.Readiness(^waiter.EventMask(0))
+ for e, mask := range fd.lowerWaiters {
+ fd.cachedFD.EventUnregister(e)
+ upperFD.EventRegister(e, mask)
+ if ready&mask != 0 {
+ e.Callback.Callback(e)
+ }
+ }
+ }
+ fd.cachedFD.DecRef(ctx)
+ fd.copiedUp = true
+ fd.cachedFD = upperFD
+ fd.cachedFlags = statusFlags
+ fd.lowerWaiters = nil
+ } else if fd.cachedFlags != statusFlags {
+ if err := fd.cachedFD.SetStatusFlags(ctx, d.fs.creds, statusFlags); err != nil {
+ return nil, err
+ }
+ fd.cachedFlags = statusFlags
+ }
+ return fd.cachedFD, nil
+}
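+
+// Callers of getCurrentFD follow the pattern below (a sketch mirroring PRead
+// further down in this file): take a reference to the current wrapped FD, use
+// it, and drop the reference when done.
+//
+//	wrappedFD, err := fd.getCurrentFD(ctx)
+//	if err != nil {
+//		return 0, err
+//	}
+//	defer wrappedFD.DecRef(ctx)
+//	return wrappedFD.PRead(ctx, dst, offset, opts)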
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *regularFileFD) Release(ctx context.Context) {
+ fd.cachedFD.DecRef(ctx)
+ fd.cachedFD = nil
+}
+
+// OnClose implements vfs.FileDescriptionImpl.OnClose.
+func (fd *regularFileFD) OnClose(ctx context.Context) error {
+	// Linux doesn't define ovl_file_operations.flush at all (i.e., its
+	// equivalent of OnClose is a no-op). We pass through to
+	// fd.cachedFD.OnClose() without upgrading even if fd.dentry() has been
+	// copied up, since OnClose is mostly used to define post-close writeback,
+	// and if fd.cachedFD hasn't been updated then it can't have been used to
+	// mutate fd.dentry() anyway.
+ fd.mu.Lock()
+ if statusFlags := fd.vfsfd.StatusFlags(); fd.cachedFlags != statusFlags {
+ if err := fd.cachedFD.SetStatusFlags(ctx, fd.filesystem().creds, statusFlags); err != nil {
+ fd.mu.Unlock()
+ return err
+ }
+ fd.cachedFlags = statusFlags
+ }
+ wrappedFD := fd.cachedFD
+ fd.mu.Unlock()
+ return wrappedFD.OnClose(ctx)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *regularFileFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+ var stat linux.Statx
+ if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 {
+ wrappedFD, err := fd.getCurrentFD(ctx)
+ if err != nil {
+ return linux.Statx{}, err
+ }
+ stat, err = wrappedFD.Stat(ctx, vfs.StatOptions{
+ Mask: layerMask,
+ Sync: opts.Sync,
+ })
+ wrappedFD.DecRef(ctx)
+ if err != nil {
+ return linux.Statx{}, err
+ }
+ }
+ fd.dentry().statInternalTo(ctx, &opts, &stat)
+ return stat, nil
+}
+
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ wrappedFD, err := fd.getCurrentFD(ctx)
+ if err != nil {
+ return err
+ }
+ defer wrappedFD.DecRef(ctx)
+ return wrappedFD.Allocate(ctx, mode, offset, length)
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *regularFileFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+ d := fd.dentry()
+ mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+ if err := vfs.CheckSetStat(ctx, auth.CredentialsFromContext(ctx), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
+ return err
+ }
+ mnt := fd.vfsfd.Mount()
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer mnt.EndWrite()
+ if err := d.copyUpLocked(ctx); err != nil {
+ return err
+ }
+ // Changes to d's attributes are serialized by d.copyMu.
+ d.copyMu.Lock()
+ defer d.copyMu.Unlock()
+ wrappedFD, err := fd.currentFDLocked(ctx)
+ if err != nil {
+ return err
+ }
+ if err := wrappedFD.SetStat(ctx, opts); err != nil {
+ return err
+ }
+ d.updateAfterSetStatLocked(&opts)
+ if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+ d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
+ }
+ return nil
+}
+
+// StatFS implements vfs.FileDescriptionImpl.StatFS.
+func (fd *regularFileFD) StatFS(ctx context.Context) (linux.Statfs, error) {
+ return fd.filesystem().statFS(ctx)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (fd *regularFileFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+ ctx := context.Background()
+ wrappedFD, err := fd.getCurrentFD(ctx)
+ if err != nil {
+ // TODO(b/171089913): Just use fd.cachedFD since Readiness can't return
+ // an error. This is obviously wrong, but at least consistent with
+ // VFS1.
+ log.Warningf("overlay.regularFileFD.Readiness: currentFDLocked failed: %v", err)
+ fd.mu.Lock()
+ wrappedFD = fd.cachedFD
+ wrappedFD.IncRef()
+ fd.mu.Unlock()
+ }
+ defer wrappedFD.DecRef(ctx)
+ return wrappedFD.Readiness(mask)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (fd *regularFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ wrappedFD, err := fd.currentFDLocked(context.Background())
+ if err != nil {
+ // TODO(b/171089913): Just use fd.cachedFD since EventRegister can't
+ // return an error. This is obviously wrong, but at least consistent
+ // with VFS1.
+ log.Warningf("overlay.regularFileFD.EventRegister: currentFDLocked failed: %v", err)
+ wrappedFD = fd.cachedFD
+ }
+ wrappedFD.EventRegister(e, mask)
+ if !fd.copiedUp {
+ if fd.lowerWaiters == nil {
+ fd.lowerWaiters = make(map[*waiter.Entry]waiter.EventMask)
+ }
+ fd.lowerWaiters[e] = mask
+ }
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (fd *regularFileFD) EventUnregister(e *waiter.Entry) {
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ fd.cachedFD.EventUnregister(e)
+ if !fd.copiedUp {
+ delete(fd.lowerWaiters, e)
+ }
+}
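
The lowerWaiters map exists so the switch in currentFDLocked can migrate registered waiters from the lower FD to the upper one; events already ready on the new FD must be re-notified, or edge-triggered waiters would sleep forever. A minimal sketch of that handoff, with toy event/entry types rather than pkg/waiter's real API:

package main

import "fmt"

type EventMask uint64

const (
	EventIn  EventMask = 1
	EventOut EventMask = 2
)

type entry struct {
	notify func(EventMask)
}

// migrate moves waiters to a new readiness source and synthesizes a
// notification for events that are already ready there, mirroring the
// lowerWaiters loop in currentFDLocked.
func migrate(waiters map[*entry]EventMask, ready EventMask) {
	for e, mask := range waiters {
		// (in the real code: unregister e from the lower FD, register on upper)
		if m := ready & mask; m != 0 {
			e.notify(m) // don't let edge-triggered waiters miss prior readiness
		}
	}
}

func main() {
	e := &entry{notify: func(m EventMask) { fmt.Printf("woken with mask %#x\n", m) }}
	migrate(map[*entry]EventMask{e: EventIn | EventOut}, EventIn)
}
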
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ wrappedFD, err := fd.getCurrentFD(ctx)
+ if err != nil {
+ return 0, err
+ }
+ defer wrappedFD.DecRef(ctx)
+ return wrappedFD.PRead(ctx, dst, offset, opts)
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ // Hold fd.mu during the read to serialize the file offset.
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ wrappedFD, err := fd.currentFDLocked(ctx)
+ if err != nil {
+ return 0, err
+ }
+ return wrappedFD.Read(ctx, dst, opts)
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ wrappedFD, err := fd.getCurrentFD(ctx)
+ if err != nil {
+ return 0, err
+ }
+ defer wrappedFD.DecRef(ctx)
+ return wrappedFD.PWrite(ctx, src, offset, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ // Hold fd.mu during the write to serialize the file offset.
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ wrappedFD, err := fd.currentFDLocked(ctx)
+ if err != nil {
+ return 0, err
+ }
+ return wrappedFD.Write(ctx, src, opts)
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ // Hold fd.mu during the seek to serialize the file offset.
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ wrappedFD, err := fd.currentFDLocked(ctx)
+ if err != nil {
+ return 0, err
+ }
+ return wrappedFD.Seek(ctx, offset, whence)
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *regularFileFD) Sync(ctx context.Context) error {
+ fd.mu.Lock()
+ if !fd.dentry().isCopiedUp() {
+ fd.mu.Unlock()
+ return nil
+ }
+ wrappedFD, err := fd.currentFDLocked(ctx)
+ if err != nil {
+ fd.mu.Unlock()
+ return err
+ }
+ wrappedFD.IncRef()
+ defer wrappedFD.DecRef(ctx)
+ fd.mu.Unlock()
+ return wrappedFD.Sync(ctx)
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (fd *regularFileFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ wrappedFD, err := fd.getCurrentFD(ctx)
+ if err != nil {
+ return 0, err
+ }
+ defer wrappedFD.DecRef(ctx)
+ return wrappedFD.Ioctl(ctx, uio, args)
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+ if err := fd.ensureMappable(ctx, opts); err != nil {
+ return err
+ }
+ return vfs.GenericConfigureMMap(&fd.vfsfd, fd.dentry(), opts)
+}
+
+// ensureMappable ensures that fd.dentry().wrappedMappable is not nil.
+func (fd *regularFileFD) ensureMappable(ctx context.Context, opts *memmap.MMapOpts) error {
+ d := fd.dentry()
+
+ // Fast path if we already have a Mappable for the current top layer.
+ if atomic.LoadUint32(&d.isMappable) != 0 {
+ return nil
+ }
+
+ // Only permit mmap of regular files, since other file types may have
+ // unpredictable behavior when mmapped (e.g. /dev/zero).
+ if atomic.LoadUint32(&d.mode)&linux.S_IFMT != linux.S_IFREG {
+ return syserror.ENODEV
+ }
+
+ // Get a Mappable for the current top layer.
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ d.copyMu.RLock()
+ defer d.copyMu.RUnlock()
+ if atomic.LoadUint32(&d.isMappable) != 0 {
+ return nil
+ }
+ wrappedFD, err := fd.currentFDLocked(ctx)
+ if err != nil {
+ return err
+ }
+ if err := wrappedFD.ConfigureMMap(ctx, opts); err != nil {
+ return err
+ }
+ if opts.MappingIdentity != nil {
+ opts.MappingIdentity.DecRef(ctx)
+ opts.MappingIdentity = nil
+ }
+ // Use this Mappable for all mappings of this layer (unless we raced with
+ // another call to ensureMappable).
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ d.dataMu.Lock()
+ defer d.dataMu.Unlock()
+ if d.wrappedMappable == nil {
+ d.wrappedMappable = opts.Mappable
+ atomic.StoreUint32(&d.isMappable, 1)
+ }
+ return nil
+}
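
ensureMappable is a double-checked initialization: the atomic isMappable flag serves the fast path locklessly, and a racer that loses under the locks simply keeps the winner's Mappable. The same shape in a self-contained sketch (lazyMappable is a hypothetical type, not part of gVisor):

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

type lazyMappable struct {
	ready uint32 // set to 1 only after value is published
	mu    sync.Mutex
	value string // stands in for d.wrappedMappable
}

func (l *lazyMappable) get(build func() string) string {
	if atomic.LoadUint32(&l.ready) != 0 { // fast path, no lock
		return l.value
	}
	l.mu.Lock()
	defer l.mu.Unlock()
	if atomic.LoadUint32(&l.ready) == 0 { // re-check under the lock
		l.value = build()
		atomic.StoreUint32(&l.ready, 1)
	}
	return l.value
}

func main() {
	var l lazyMappable
	fmt.Println(l.get(func() string { return "mappable" }))
	fmt.Println(l.get(func() string { panic("built twice") }))
}
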
+
+// AddMapping implements memmap.Mappable.AddMapping.
+func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ if err := d.wrappedMappable.AddMapping(ctx, ms, ar, offset, writable); err != nil {
+ return err
+ }
+ if !d.isCopiedUp() {
+ d.lowerMappings.AddMapping(ms, ar, offset, writable)
+ }
+ return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ d.wrappedMappable.RemoveMapping(ctx, ms, ar, offset, writable)
+ if !d.isCopiedUp() {
+ d.lowerMappings.RemoveMapping(ms, ar, offset, writable)
+ }
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error {
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ if err := d.wrappedMappable.CopyMapping(ctx, ms, srcAR, dstAR, offset, writable); err != nil {
+ return err
+ }
+ if !d.isCopiedUp() {
+ d.lowerMappings.AddMapping(ms, dstAR, offset, writable)
+ }
+ return nil
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+ d.dataMu.RLock()
+ defer d.dataMu.RUnlock()
+ return d.wrappedMappable.Translate(ctx, required, optional, at)
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ return d.wrappedMappable.InvalidateUnsavable(ctx)
+}
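
The lowerMappings set mirrors every mapping made while the dentry still lives on a lower layer, so copy-up can later find those ranges and repoint them at the upper Mappable; once copied up, new mappings go only to wrappedMappable. A toy sketch of that bookkeeping, assuming nothing about the real memmap types:

package main

import "fmt"

// rng stands in for a mapped range of the file ([start, end) in bytes).
type rng struct{ start, end uint64 }

// mirror models the dentry's bookkeeping: while the file is on a lower
// layer, every mapping is also recorded in lower for later invalidation.
type mirror struct {
	copiedUp bool
	lower    map[rng]int
}

func (m *mirror) add(r rng) {
	// (forward to wrappedMappable.AddMapping here)
	if !m.copiedUp {
		m.lower[r]++
	}
}

func main() {
	m := &mirror{lower: map[rng]int{}}
	m.add(rng{0, 4096})
	m.copiedUp = true
	m.add(rng{4096, 8192}) // upper layer owns this one; no mirroring needed
	fmt.Println(len(m.lower)) // 1
}
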
diff --git a/pkg/sentry/fsimpl/overlay/save_restore.go b/pkg/sentry/fsimpl/overlay/save_restore.go
new file mode 100644
index 000000000..054e17b17
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/save_restore.go
@@ -0,0 +1,27 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/refsvfs2"
+)
+
+func (d *dentry) afterLoad() {
+ if refsvfs2.LeakCheckEnabled() && atomic.LoadInt64(&d.refs) != -1 {
+ refsvfs2.Register(d, "overlay.dentry")
+ }
+}
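
afterLoad re-registers live dentries with the leak checker on restore; refs == -1 is the sentinel for an already-destroyed object. A simplified stand-in for what this hook does, where a plain map plays the role of refsvfs2's registry:

package main

import "fmt"

var registry = map[string]int{}

type obj struct{ refs int64 }

// afterLoad re-registers with the leak checker only if the object was still
// live (refs != -1, i.e. not destroyed) at save time.
func (o *obj) afterLoad(leakCheckEnabled bool) {
	if leakCheckEnabled && o.refs != -1 {
		registry["overlay.dentry"]++
	}
}

func main() {
	live, dead := &obj{refs: 2}, &obj{refs: -1}
	live.afterLoad(true)
	dead.afterLoad(true)
	fmt.Println(registry["overlay.dentry"]) // 1
}
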
diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go
index cab771211..e44b79b68 100644
--- a/pkg/sentry/fsimpl/pipefs/pipefs.go
+++ b/pkg/sentry/fsimpl/pipefs/pipefs.go
@@ -31,6 +31,7 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
+// +stateify savable
type filesystemType struct{}
// Name implements vfs.FilesystemType.Name.
@@ -38,11 +39,15 @@ func (filesystemType) Name() string {
return "pipefs"
}
+// Release implements vfs.FilesystemType.Release.
+func (filesystemType) Release(ctx context.Context) {}
+
// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
panic("pipefs.filesystemType.GetFilesystem should never be called")
}
+// +stateify savable
type filesystem struct {
kernfs.Filesystem
@@ -63,9 +68,9 @@ func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) {
}
// Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {
+func (fs *filesystem) Release(ctx context.Context) {
fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
- fs.Filesystem.Release()
+ fs.Filesystem.Release(ctx)
}
// PrependPath implements vfs.FilesystemImpl.PrependPath.
@@ -76,12 +81,15 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
}
// inode implements kernfs.Inode.
+//
+// +stateify savable
type inode struct {
kernfs.InodeNotDirectory
kernfs.InodeNotSymlink
kernfs.InodeNoopRefCount
- pipe *pipe.VFSPipe
+ locks vfs.FileLocks
+ pipe *pipe.VFSPipe
ino uint64
uid auth.KUID
@@ -114,7 +122,7 @@ func (i *inode) Mode() linux.FileMode {
}
// Stat implements kernfs.Inode.Stat.
-func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+func (i *inode) Stat(_ context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
ts := linux.NsecToStatxTimestamp(i.ctime.Nanoseconds())
return linux.Statx{
Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS,
@@ -142,12 +150,14 @@ func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.
return syserror.EPERM
}
-// TODO(gvisor.dev/issue/1193): kernfs does not provide a way to implement
-// statfs, from which we should indicate PIPEFS_MAGIC.
-
// Open implements kernfs.Inode.Open.
-func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags)
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ return i.pipe.Open(ctx, rp.Mount(), d.VFSDentry(), opts.Flags, &i.locks)
+}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
+ return vfs.GenericStatFS(linux.PIPEFS_MAGIC), nil
}
// NewConnectedPipeFDs returns a pair of FileDescriptions representing the read
@@ -158,7 +168,7 @@ func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vf
fs := mnt.Filesystem().Impl().(*filesystem)
inode := newInode(ctx, fs)
var d kernfs.Dentry
- d.Init(inode)
- defer d.DecRef()
+ d.Init(&fs.Filesystem, inode)
+ defer d.DecRef(ctx)
return inode.pipe.ReaderWriterPair(mnt, d.VFSDentry(), flags)
}
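
NewConnectedPipeFDs hands back both ends of one pipe, already open, which is how pipe(2) is ultimately serviced. The host primitive below has the same shape and makes a convenient standalone analogue (os.Pipe is not what the sentry uses internally):

package main

import (
	"fmt"
	"os"
)

func main() {
	// os.Pipe plays the role of NewConnectedPipeFDs: it returns the read and
	// write ends of one pipe, both already open.
	r, w, err := os.Pipe()
	if err != nil {
		panic(err)
	}
	defer r.Close()
	defer w.Close()

	if _, err := w.Write([]byte("hello")); err != nil {
		panic(err)
	}
	buf := make([]byte, 5)
	n, _ := r.Read(buf)
	fmt.Println(string(buf[:n])) // hello
}
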
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index 17c1342b5..5196a2a80 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -1,18 +1,79 @@
load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
licenses(["notice"])
+go_template_instance(
+ name = "fd_dir_inode_refs",
+ out = "fd_dir_inode_refs.go",
+ package = "proc",
+ prefix = "fdDirInode",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "fdDirInode",
+ },
+)
+
+go_template_instance(
+ name = "fd_info_dir_inode_refs",
+ out = "fd_info_dir_inode_refs.go",
+ package = "proc",
+ prefix = "fdInfoDirInode",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "fdInfoDirInode",
+ },
+)
+
+go_template_instance(
+ name = "subtasks_inode_refs",
+ out = "subtasks_inode_refs.go",
+ package = "proc",
+ prefix = "subtasksInode",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "subtasksInode",
+ },
+)
+
+go_template_instance(
+ name = "task_inode_refs",
+ out = "task_inode_refs.go",
+ package = "proc",
+ prefix = "taskInode",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "taskInode",
+ },
+)
+
+go_template_instance(
+ name = "tasks_inode_refs",
+ out = "tasks_inode_refs.go",
+ package = "proc",
+ prefix = "tasksInode",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "tasksInode",
+ },
+)
+
go_library(
name = "proc",
srcs = [
+ "fd_dir_inode_refs.go",
+ "fd_info_dir_inode_refs.go",
"filesystem.go",
"subtasks.go",
+ "subtasks_inode_refs.go",
"task.go",
"task_fds.go",
"task_files.go",
+ "task_inode_refs.go",
"task_net.go",
"tasks.go",
"tasks_files.go",
+ "tasks_inode_refs.go",
"tasks_sys.go",
],
visibility = ["//pkg/sentry:internal"],
@@ -21,7 +82,9 @@ go_library(
"//pkg/context",
"//pkg/log",
"//pkg/refs",
+ "//pkg/refsvfs2",
"//pkg/safemem",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/fsbridge",
"//pkg/sentry/fsimpl/kernfs",
"//pkg/sentry/inet",
@@ -35,8 +98,10 @@ go_library(
"//pkg/sentry/socket/unix/transport",
"//pkg/sentry/usage",
"//pkg/sentry/vfs",
+ "//pkg/sync",
"//pkg/syserror",
"//pkg/tcpip/header",
+ "//pkg/tcpip/network/ipv4",
"//pkg/usermem",
],
)
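
Each go_template_instance above generates a <prefix>Refs type from //pkg/refsvfs2:refs_template that the inode embeds. A simplified sketch of the generated counter's contract (the real template also panics on misuse and hooks into leak checking); note that refCount's zero value already means one held reference, so DecRef destroys when the count reaches -1:

package main

import (
	"fmt"
	"sync/atomic"
)

type taskInodeRefs struct {
	// refCount's zero value means one reference is held.
	refCount int64
}

func (r *taskInodeRefs) IncRef() {
	atomic.AddInt64(&r.refCount, 1)
}

// DecRef calls destroy when the last reference is dropped, which is exactly
// how the DecRef methods in this change use it.
func (r *taskInodeRefs) DecRef(destroy func()) {
	if atomic.AddInt64(&r.refCount, -1) == -1 && destroy != nil {
		destroy()
	}
}

func main() {
	var refs taskInodeRefs // starts with one reference
	refs.IncRef()
	refs.DecRef(nil)
	refs.DecRef(func() { fmt.Println("destroyed") })
}
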
diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go
index 609210253..99abcab66 100644
--- a/pkg/sentry/fsimpl/proc/filesystem.go
+++ b/pkg/sentry/fsimpl/proc/filesystem.go
@@ -17,6 +17,7 @@ package proc
import (
"fmt"
+ "strconv"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
@@ -24,23 +25,29 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
)
-// Name is the default filesystem name.
-const Name = "proc"
+const (
+ // Name is the default filesystem name.
+ Name = "proc"
+ defaultMaxCachedDentries = uint64(1000)
+)
// FilesystemType is the factory class for procfs.
//
// +stateify savable
type FilesystemType struct{}
-var _ vfs.FilesystemType = (*FilesystemType)(nil)
-
// Name implements vfs.FilesystemType.Name.
func (FilesystemType) Name() string {
return Name
}
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
+// +stateify savable
type filesystem struct {
kernfs.Filesystem
@@ -61,9 +68,22 @@ func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualF
if err != nil {
return nil, nil, err
}
+
+ mopts := vfs.GenericParseMountOptions(opts.Data)
+ maxCachedDentries := defaultMaxCachedDentries
+ if str, ok := mopts["dentry_cache_limit"]; ok {
+ delete(mopts, "dentry_cache_limit")
+ maxCachedDentries, err = strconv.ParseUint(str, 10, 64)
+ if err != nil {
+ ctx.Warningf("proc.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str)
+ return nil, nil, syserror.EINVAL
+ }
+ }
+
procfs := &filesystem{
devMinor: devMinor,
}
+ procfs.MaxCachedDentries = maxCachedDentries
procfs.VFSFilesystem().Init(vfsObj, &ft, procfs)
var cgroups map[string]string
@@ -72,33 +92,35 @@ func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualF
cgroups = data.Cgroups
}
- _, dentry := procfs.newTasksInode(k, pidns, cgroups)
+ inode := procfs.newTasksInode(ctx, k, pidns, cgroups)
+ var dentry kernfs.Dentry
+ dentry.Init(&procfs.Filesystem, inode)
return procfs.VFSFilesystem(), dentry.VFSDentry(), nil
}
// Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {
+func (fs *filesystem) Release(ctx context.Context) {
fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
- fs.Filesystem.Release()
+ fs.Filesystem.Release(ctx)
}
// dynamicInode is an overfitted interface for common Inodes with
// dynamicByteSource types used in procfs.
+//
+// +stateify savable
type dynamicInode interface {
kernfs.Inode
vfs.DynamicBytesSource
- Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode)
+ Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode)
}
-func (fs *filesystem) newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry {
- inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm)
-
- d := &kernfs.Dentry{}
- d.Init(inode)
- return d
+func (fs *filesystem) newInode(ctx context.Context, creds *auth.Credentials, perm linux.FileMode, inode dynamicInode) dynamicInode {
+ inode.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), inode, perm)
+ return inode
}
+// +stateify savable
type staticFile struct {
kernfs.DynamicBytesFile
vfs.StaticData
@@ -110,8 +132,24 @@ func newStaticFile(data string) *staticFile {
return &staticFile{StaticData: vfs.StaticData{Data: data}}
}
+func (fs *filesystem) newStaticDir(ctx context.Context, creds *auth.Credentials, children map[string]kernfs.Inode) kernfs.Inode {
+ return kernfs.NewStaticDir(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, children, kernfs.GenericDirectoryFDOptions{
+ SeekEnd: kernfs.SeekEndZero,
+ })
+}
+
// InternalData contains internal data passed in to the procfs mount via
// vfs.GetFilesystemOptions.InternalData.
+//
+// +stateify savable
type InternalData struct {
Cgroups map[string]string
}
+
+// +stateify savable
+type implStatFS struct{}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
+ return vfs.GenericStatFS(linux.PROC_SUPER_MAGIC), nil
+}
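
The dentry_cache_limit handling above assumes mount data arrives as comma-separated key=value pairs, which is what vfs.GenericParseMountOptions produces. A self-contained sketch of that parse-and-consume flow (parseMountData is a stand-in, not the real helper):

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseMountData mimics vfs.GenericParseMountOptions under the assumption
// that data is comma-separated "key" or "key=value" tokens.
func parseMountData(data string) map[string]string {
	m := map[string]string{}
	for _, opt := range strings.Split(data, ",") {
		if opt == "" {
			continue
		}
		if kv := strings.SplitN(opt, "=", 2); len(kv) == 2 {
			m[kv[0]] = kv[1]
		} else {
			m[opt] = ""
		}
	}
	return m
}

func main() {
	mopts := parseMountData("dentry_cache_limit=500")
	maxCachedDentries := uint64(1000) // defaultMaxCachedDentries above
	if str, ok := mopts["dentry_cache_limit"]; ok {
		delete(mopts, "dentry_cache_limit")
		if v, err := strconv.ParseUint(str, 10, 64); err == nil {
			maxCachedDentries = v
		} // the real code rejects the mount with EINVAL instead
	}
	fmt.Println(maxCachedDentries) // 500
}
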
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index 36a911db4..cb3c5e0fd 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -31,11 +31,16 @@ import (
//
// +stateify savable
type subtasksInode struct {
- kernfs.InodeNotSymlink
- kernfs.InodeDirectoryNoNewChildren
+ implStatFS
+ kernfs.InodeAlwaysValid
kernfs.InodeAttrs
+ kernfs.InodeDirectoryNoNewChildren
+ kernfs.InodeNotSymlink
+ kernfs.InodeTemporary
kernfs.OrderedChildren
- kernfs.AlwaysValid
+ subtasksInodeRefs
+
+ locks vfs.FileLocks
fs *filesystem
task *kernel.Task
@@ -45,7 +50,7 @@ type subtasksInode struct {
var _ kernfs.Inode = (*subtasksInode)(nil)
-func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) *kernfs.Dentry {
+func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) kernfs.Inode {
subInode := &subtasksInode{
fs: fs,
task: task,
@@ -53,18 +58,16 @@ func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace,
cgroupControllers: cgroupControllers,
}
// Note: credentials are overridden by taskOwnedInode.
- subInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+ subInode.InodeAttrs.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
subInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+ subInode.EnableLeakCheck()
inode := &taskOwnedInode{Inode: subInode, owner: task}
- dentry := &kernfs.Dentry{}
- dentry.Init(inode)
-
- return dentry
+ return inode
}
-// Lookup implements kernfs.inodeDynamicLookup.
-func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+// Lookup implements kernfs.inodeDirectory.Lookup.
+func (i *subtasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
tid, err := strconv.ParseUint(name, 10, 32)
if err != nil {
return nil, syserror.ENOENT
@@ -77,13 +80,11 @@ func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, e
if subTask.ThreadGroup() != i.task.ThreadGroup() {
return nil, syserror.ENOENT
}
-
- subTaskDentry := i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers)
- return subTaskDentry.VFSDentry(), nil
+ return i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers)
}
-// IterDirents implements kernfs.inodeDynamicLookup.
-func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+// IterDirents implements kernfs.inodeDirectory.IterDirents.
+func (i *subtasksInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
tasks := i.task.ThreadGroup().MemberIDs(i.pidns)
if len(tasks) == 0 {
return offset, syserror.ENOENT
@@ -113,6 +114,7 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb
return offset, nil
}
+// +stateify savable
type subtasksFD struct {
kernfs.GenericDirectoryFD
@@ -126,7 +128,7 @@ func (fd *subtasksFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallbac
return fd.GenericDirectoryFD.IterDirents(ctx, cb)
}
-// Seek implements vfs.FileDecriptionImpl.Seek.
+// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *subtasksFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
if fd.task.ExitState() >= kernel.TaskExitZombie {
return 0, syserror.ENOENT
@@ -150,21 +152,23 @@ func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) erro
return fd.GenericDirectoryFD.SetStat(ctx, opts)
}
-// Open implements kernfs.Inode.
-func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+// Open implements kernfs.Inode.Open.
+func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd := &subtasksFD{task: i.task}
- if err := fd.Init(&i.OrderedChildren, &opts); err != nil {
+ if err := fd.Init(&i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+ SeekEnd: kernfs.SeekEndZero,
+ }); err != nil {
return nil, err
}
- if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+ if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
return nil, err
}
return fd.VFSFileDescription(), nil
}
-// Stat implements kernfs.Inode.
-func (i *subtasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
- stat, err := i.InodeAttrs.Stat(vsfs, opts)
+// Stat implements kernfs.Inode.Stat.
+func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+ stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts)
if err != nil {
return linux.Statx{}, err
}
@@ -174,7 +178,12 @@ func (i *subtasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.
return stat, nil
}
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
return syserror.EPERM
}
+
+// DecRef implements kernfs.Inode.DecRef.
+func (i *subtasksInode) DecRef(ctx context.Context) {
+ i.subtasksInodeRefs.DecRef(func() { i.Destroy(ctx) })
+}
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 482055db1..19011b010 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -32,91 +32,108 @@ import (
//
// +stateify savable
type taskInode struct {
- kernfs.InodeNotSymlink
- kernfs.InodeDirectoryNoNewChildren
- kernfs.InodeNoDynamicLookup
+ implStatFS
kernfs.InodeAttrs
+ kernfs.InodeDirectoryNoNewChildren
+ kernfs.InodeNotSymlink
+ kernfs.InodeTemporary
kernfs.OrderedChildren
+ taskInodeRefs
+
+ locks vfs.FileLocks
task *kernel.Task
}
var _ kernfs.Inode = (*taskInode)(nil)
-func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry {
- // TODO(gvisor.dev/issue/164): Fail with ESRCH if task exited.
- contents := map[string]*kernfs.Dentry{
- "auxv": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &auxvData{task: task}),
- "cmdline": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}),
+func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) (kernfs.Inode, error) {
+ if task.ExitState() == kernel.TaskExitDead {
+ return nil, syserror.ESRCH
+ }
+
+ contents := map[string]kernfs.Inode{
+ "auxv": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &auxvData{task: task}),
+ "cmdline": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}),
"comm": fs.newComm(task, fs.NextIno(), 0444),
- "environ": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}),
+ "cwd": fs.newCwdSymlink(task, fs.NextIno()),
+ "environ": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}),
"exe": fs.newExeSymlink(task, fs.NextIno()),
"fd": fs.newFDDirInode(task),
"fdinfo": fs.newFDInfoDirInode(task),
- "gid_map": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}),
- "io": fs.newTaskOwnedFile(task, fs.NextIno(), 0400, newIO(task, isThreadGroup)),
- "maps": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mapsData{task: task}),
- "mountinfo": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountInfoData{task: task}),
- "mounts": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountsData{task: task}),
+ "gid_map": fs.newTaskOwnedInode(task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}),
+ "io": fs.newTaskOwnedInode(task, fs.NextIno(), 0400, newIO(task, isThreadGroup)),
+ "maps": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &mapsData{task: task}),
+ "mem": fs.newMemInode(task, fs.NextIno(), 0400),
+ "mountinfo": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &mountInfoData{task: task}),
+ "mounts": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &mountsData{task: task}),
"net": fs.newTaskNetDir(task),
- "ns": fs.newTaskOwnedDir(task, fs.NextIno(), 0511, map[string]*kernfs.Dentry{
+ "ns": fs.newTaskOwnedDir(task, fs.NextIno(), 0511, map[string]kernfs.Inode{
"net": fs.newNamespaceSymlink(task, fs.NextIno(), "net"),
"pid": fs.newNamespaceSymlink(task, fs.NextIno(), "pid"),
"user": fs.newNamespaceSymlink(task, fs.NextIno(), "user"),
}),
- "oom_score": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newStaticFile("0\n")),
- "oom_score_adj": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &oomScoreAdj{task: task}),
- "smaps": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &smapsData{task: task}),
- "stat": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}),
- "statm": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statmData{task: task}),
- "status": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}),
- "uid_map": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}),
+ "oom_score": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, newStaticFile("0\n")),
+ "oom_score_adj": fs.newTaskOwnedInode(task, fs.NextIno(), 0644, &oomScoreAdj{task: task}),
+ "smaps": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &smapsData{task: task}),
+ "stat": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}),
+ "statm": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &statmData{task: task}),
+ "status": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}),
+ "uid_map": fs.newTaskOwnedInode(task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}),
}
if isThreadGroup {
contents["task"] = fs.newSubtasks(task, pidns, cgroupControllers)
}
if len(cgroupControllers) > 0 {
- contents["cgroup"] = fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newCgroupData(cgroupControllers))
+ contents["cgroup"] = fs.newTaskOwnedInode(task, fs.NextIno(), 0444, newCgroupData(cgroupControllers))
}
taskInode := &taskInode{task: task}
// Note: credentials are overridden by taskOwnedInode.
- taskInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+ taskInode.InodeAttrs.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+ taskInode.EnableLeakCheck()
inode := &taskOwnedInode{Inode: taskInode, owner: task}
- dentry := &kernfs.Dentry{}
- dentry.Init(inode)
taskInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
- links := taskInode.OrderedChildren.Populate(dentry, contents)
+ links := taskInode.OrderedChildren.Populate(contents)
taskInode.IncLinks(links)
- return dentry
+ return inode, nil
}
-// Valid implements kernfs.inodeDynamicLookup. This inode remains valid as long
+// Valid implements kernfs.Inode.Valid. This inode remains valid as long
// as the task is still running. When it's dead, another task with the same
// PID could replace it.
func (i *taskInode) Valid(ctx context.Context) bool {
return i.task.ExitState() != kernel.TaskExitDead
}
-// Open implements kernfs.Inode.
-func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+// Open implements kernfs.Inode.Open.
+func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+ SeekEnd: kernfs.SeekEndZero,
+ })
if err != nil {
return nil, err
}
return fd.VFSFileDescription(), nil
}
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
return syserror.EPERM
}
+// DecRef implements kernfs.Inode.DecRef.
+func (i *taskInode) DecRef(ctx context.Context) {
+ i.taskInodeRefs.DecRef(func() { i.Destroy(ctx) })
+}
+
// taskOwnedInode implements kernfs.Inode and overrides inode owner with task
// effective user and group.
+//
+// +stateify savable
type taskOwnedInode struct {
kernfs.Inode
@@ -126,36 +143,28 @@ type taskOwnedInode struct {
var _ kernfs.Inode = (*taskOwnedInode)(nil)
-func (fs *filesystem) newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry {
+func (fs *filesystem) newTaskOwnedInode(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) kernfs.Inode {
// Note: credentials are overridden by taskOwnedInode.
- inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm)
+ inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm)
- taskInode := &taskOwnedInode{Inode: inode, owner: task}
- d := &kernfs.Dentry{}
- d.Init(taskInode)
- return d
+ return &taskOwnedInode{Inode: inode, owner: task}
}
-func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry {
- dir := &kernfs.StaticDirectory{}
-
+func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]kernfs.Inode) kernfs.Inode {
// Note: credentials are overridden by taskOwnedInode.
- dir.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm)
-
- inode := &taskOwnedInode{Inode: dir, owner: task}
- d := &kernfs.Dentry{}
- d.Init(inode)
+ fdOpts := kernfs.GenericDirectoryFDOptions{SeekEnd: kernfs.SeekEndZero}
+ dir := kernfs.NewStaticDir(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm, children, fdOpts)
- dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
- links := dir.OrderedChildren.Populate(d, children)
- dir.IncLinks(links)
+ return &taskOwnedInode{Inode: dir, owner: task}
+}
- return d
+func (i *taskOwnedInode) Valid(ctx context.Context) bool {
+ return i.owner.ExitState() != kernel.TaskExitDead && i.Inode.Valid(ctx)
}
-// Stat implements kernfs.Inode.
-func (i *taskOwnedInode) Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
- stat, err := i.Inode.Stat(fs, opts)
+// Stat implements kernfs.Inode.Stat.
+func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+ stat, err := i.Inode.Stat(ctx, fs, opts)
if err != nil {
return linux.Statx{}, err
}
@@ -171,7 +180,7 @@ func (i *taskOwnedInode) Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.S
return stat, nil
}
-// CheckPermissions implements kernfs.Inode.
+// CheckPermissions implements kernfs.Inode.CheckPermissions.
func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
mode := i.Mode()
uid, gid := i.getOwner(mode)
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index 44ccc9e4a..d268b44be 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -22,7 +22,6 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -43,16 +42,19 @@ func getTaskFD(t *kernel.Task, fd int32) (*vfs.FileDescription, kernel.FDFlags)
return file, flags
}
-func taskFDExists(t *kernel.Task, fd int32) bool {
+func taskFDExists(ctx context.Context, t *kernel.Task, fd int32) bool {
file, _ := getTaskFD(t, fd)
if file == nil {
return false
}
- file.DecRef()
+ file.DecRef(ctx)
return true
}
+// +stateify savable
type fdDir struct {
+ locks vfs.FileLocks
+
fs *filesystem
task *kernel.Task
@@ -61,16 +63,15 @@ type fdDir struct {
produceSymlink bool
}
-// IterDirents implements kernfs.inodeDynamicLookup.
-func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, absOffset, relOffset int64) (int64, error) {
+// IterDirents implements kernfs.inodeDirectory.IterDirents.
+func (i *fdDir) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
var fds []int32
i.task.WithMuLocked(func(t *kernel.Task) {
if fdTable := t.FDTable(); fdTable != nil {
- fds = fdTable.GetFDs()
+ fds = fdTable.GetFDs(ctx)
}
})
- offset := absOffset + relOffset
typ := uint8(linux.DT_REG)
if i.produceSymlink {
typ = linux.DT_LNK
@@ -86,31 +87,39 @@ func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, abs
Name: strconv.FormatUint(uint64(fd), 10),
Type: typ,
Ino: i.fs.NextIno(),
- NextOff: offset + 1,
+ NextOff: int64(fd) + 3,
}
if err := cb.Handle(dirent); err != nil {
- return offset, err
+ // Getdents should iterate correctly despite mutation
+ // of fds, so we return the next fd to serialize plus
+ // 2 (which accounts for the "." and ".." tracked by
+ // kernfs) as the offset.
+ return int64(fd) + 2, err
}
- offset++
}
- return offset, nil
+ // We serialized them all. Next offset should be higher than last
+ // serialized fd.
+ return int64(fds[len(fds)-1]) + 3, nil
}
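
The offset scheme above: kernfs itself serves "." at offset 0 and ".." at offset 1, so fd N maps to offset N+2 and resumes at NextOff N+3. Keying offsets to fd numbers rather than iteration position keeps getdents stable even if other fds are closed between calls. A tiny worked example:

package main

import "fmt"

func main() {
	fds := []int32{0, 1, 5}
	for _, fd := range fds {
		fmt.Printf("fd %d: served at offset %d, NextOff %d\n",
			fd, int64(fd)+2, int64(fd)+3)
	}
	// fd 0: served at offset 2, NextOff 3
	// fd 1: served at offset 3, NextOff 4
	// fd 5: served at offset 7, NextOff 8
}
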
// fdDirInode represents the inode for /proc/[pid]/fd directory.
//
// +stateify savable
type fdDirInode struct {
- kernfs.InodeNotSymlink
- kernfs.InodeDirectoryNoNewChildren
+ fdDir
+ fdDirInodeRefs
+ implStatFS
+ kernfs.InodeAlwaysValid
kernfs.InodeAttrs
+ kernfs.InodeDirectoryNoNewChildren
+ kernfs.InodeNotSymlink
+ kernfs.InodeTemporary
kernfs.OrderedChildren
- kernfs.AlwaysValid
- fdDir
}
var _ kernfs.Inode = (*fdDirInode)(nil)
-func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry {
+func (fs *filesystem) newFDDirInode(task *kernel.Task) kernfs.Inode {
inode := &fdDirInode{
fdDir: fdDir{
fs: fs,
@@ -118,39 +127,42 @@ func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry {
produceSymlink: true,
},
}
- inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
-
- dentry := &kernfs.Dentry{}
- dentry.Init(inode)
+ inode.InodeAttrs.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+ inode.EnableLeakCheck()
inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+ return inode
+}
- return dentry
+// IterDirents implements kernfs.inodeDirectory.IterDirents.
+func (i *fdDirInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+ return i.fdDir.IterDirents(ctx, mnt, cb, offset, relOffset)
}
-// Lookup implements kernfs.inodeDynamicLookup.
-func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+// Lookup implements kernfs.inodeDirectory.Lookup.
+func (i *fdDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
fdInt, err := strconv.ParseInt(name, 10, 32)
if err != nil {
return nil, syserror.ENOENT
}
fd := int32(fdInt)
- if !taskFDExists(i.task, fd) {
+ if !taskFDExists(ctx, i.task, fd) {
return nil, syserror.ENOENT
}
- taskDentry := i.fs.newFDSymlink(i.task, fd, i.fs.NextIno())
- return taskDentry.VFSDentry(), nil
+ return i.fs.newFDSymlink(i.task, fd, i.fs.NextIno()), nil
}
-// Open implements kernfs.Inode.
-func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+// Open implements kernfs.Inode.Open.
+func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+ SeekEnd: kernfs.SeekEndZero,
+ })
if err != nil {
return nil, err
}
return fd.VFSFileDescription(), nil
}
-// CheckPermissions implements kernfs.Inode.
+// CheckPermissions implements kernfs.Inode.CheckPermissions.
//
// This is to match Linux, which uses a special permission handler to guarantee
// that a process can still access /proc/self/fd after it has executed
@@ -172,10 +184,16 @@ func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentia
return err
}
+// DecRef implements kernfs.Inode.DecRef.
+func (i *fdDirInode) DecRef(ctx context.Context) {
+ i.fdDirInodeRefs.DecRef(func() { i.Destroy(ctx) })
+}
+
// fdSymlink is a symlink for the /proc/[pid]/fd/[fd] file.
//
// +stateify savable
type fdSymlink struct {
+ implStatFS
kernfs.InodeAttrs
kernfs.InodeNoopRefCount
kernfs.InodeSymlink
@@ -186,26 +204,23 @@ type fdSymlink struct {
var _ kernfs.Inode = (*fdSymlink)(nil)
-func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) *kernfs.Dentry {
+func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) kernfs.Inode {
inode := &fdSymlink{
task: task,
fd: fd,
}
- inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
-
- d := &kernfs.Dentry{}
- d.Init(inode)
- return d
+ inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
+ return inode
}
-func (s *fdSymlink) Readlink(ctx context.Context) (string, error) {
+func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
file, _ := getTaskFD(s.task, s.fd)
if file == nil {
return "", syserror.ENOENT
}
- defer file.DecRef()
+ defer file.DecRef(ctx)
root := vfs.RootFromContext(ctx)
- defer root.DecRef()
+ defer root.DecRef(ctx)
return s.task.Kernel().VFS().PathnameWithDeleted(ctx, root, file.VirtualDentry())
}
@@ -214,75 +229,90 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen
if file == nil {
return vfs.VirtualDentry{}, "", syserror.ENOENT
}
- defer file.DecRef()
+ defer file.DecRef(ctx)
vd := file.VirtualDentry()
vd.IncRef()
return vd, "", nil
}
+// Valid implements kernfs.Inode.Valid.
+func (s *fdSymlink) Valid(ctx context.Context) bool {
+ return taskFDExists(ctx, s.task, s.fd)
+}
+
// fdInfoDirInode represents the inode for /proc/[pid]/fdinfo directory.
//
// +stateify savable
type fdInfoDirInode struct {
- kernfs.InodeNotSymlink
- kernfs.InodeDirectoryNoNewChildren
+ fdDir
+ fdInfoDirInodeRefs
+ implStatFS
+ kernfs.InodeAlwaysValid
kernfs.InodeAttrs
+ kernfs.InodeDirectoryNoNewChildren
+ kernfs.InodeNotSymlink
+ kernfs.InodeTemporary
kernfs.OrderedChildren
- kernfs.AlwaysValid
- fdDir
}
var _ kernfs.Inode = (*fdInfoDirInode)(nil)
-func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry {
+func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) kernfs.Inode {
inode := &fdInfoDirInode{
fdDir: fdDir{
fs: fs,
task: task,
},
}
- inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
-
- dentry := &kernfs.Dentry{}
- dentry.Init(inode)
+ inode.InodeAttrs.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+ inode.EnableLeakCheck()
inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
-
- return dentry
+ return inode
}
-// Lookup implements kernfs.inodeDynamicLookup.
-func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+// Lookup implements kernfs.inodeDirectory.Lookup.
+func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
fdInt, err := strconv.ParseInt(name, 10, 32)
if err != nil {
return nil, syserror.ENOENT
}
fd := int32(fdInt)
- if !taskFDExists(i.task, fd) {
+ if !taskFDExists(ctx, i.task, fd) {
return nil, syserror.ENOENT
}
data := &fdInfoData{
task: i.task,
fd: fd,
}
- dentry := i.fs.newTaskOwnedFile(i.task, i.fs.NextIno(), 0444, data)
- return dentry.VFSDentry(), nil
+ return i.fs.newTaskOwnedInode(i.task, i.fs.NextIno(), 0444, data), nil
}
-// Open implements kernfs.Inode.
-func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+// IterDirents implements kernfs.inodeDirectory.IterDirents.
+func (i *fdInfoDirInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
+ return i.fdDir.IterDirents(ctx, mnt, cb, offset, relOffset)
+}
+
+// Open implements kernfs.Inode.Open.
+func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+ SeekEnd: kernfs.SeekEndZero,
+ })
if err != nil {
return nil, err
}
return fd.VFSFileDescription(), nil
}
+// DecRef implements kernfs.Inode.DecRef.
+func (i *fdInfoDirInode) DecRef(ctx context.Context) {
+ i.fdInfoDirInodeRefs.DecRef(func() { i.Destroy(ctx) })
+}
+
// fdInfoData implements vfs.DynamicBytesSource for /proc/[pid]/fdinfo/[fd].
//
// +stateify savable
type fdInfoData struct {
kernfs.DynamicBytesFile
- refs.AtomicRefCount
task *kernel.Task
fd int32
@@ -296,7 +326,7 @@ func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
if file == nil {
return syserror.ENOENT
}
- defer file.DecRef()
+ defer file.DecRef(ctx)
// TODO(b/121266871): Include pos, locks, and other data. For now we only
// have flags.
// See https://www.kernel.org/doc/Documentation/filesystems/proc.txt
@@ -304,3 +334,8 @@ func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
fmt.Fprintf(buf, "flags:\t0%o\n", flags)
return nil
}
+
+// Valid implements kernfs.Inode.Valid.
+func (d *fdInfoData) Valid(ctx context.Context) bool {
+ return taskFDExists(ctx, d.task, d.fd)
+}
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 2f297e48a..ba71d0fde 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -22,6 +22,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/safemem"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsbridge"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -30,10 +31,15 @@ import (
"gvisor.dev/gvisor/pkg/sentry/mm"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
+// "There is an (arbitrary) limit on the number of lines in the file. As at
+// Linux 3.18, the limit is five lines." - user_namespaces(7)
+const maxIDMapLines = 5
+
// mm gets the kernel task's MemoryManager. No additional reference is taken on
// mm here. This is safe because MemoryManager.destroy is required to leave the
// MemoryManager in a state where it's still usable as a DynamicBytesSource.
@@ -226,8 +232,9 @@ func (d *cmdlineData) Generate(ctx context.Context, buf *bytes.Buffer) error {
// Linux will return envp up to and including the first NULL character,
// so find it.
- if end := bytes.IndexByte(buf.Bytes()[ar.Length():], 0); end != -1 {
- buf.Truncate(end)
+ envStart := int(ar.Length())
+ if nullIdx := bytes.IndexByte(buf.Bytes()[envStart:], 0); nullIdx != -1 {
+ buf.Truncate(envStart + nullIdx)
}
}
@@ -241,13 +248,10 @@ type commInode struct {
task *kernel.Task
}
-func (fs *filesystem) newComm(task *kernel.Task, ino uint64, perm linux.FileMode) *kernfs.Dentry {
+func (fs *filesystem) newComm(task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode {
inode := &commInode{task: task}
- inode.DynamicBytesFile.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm)
-
- d := &kernfs.Dentry{}
- d.Init(inode)
- return d
+ inode.DynamicBytesFile.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm)
+ return inode
}
func (i *commInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
@@ -282,7 +286,8 @@ func (d *commData) Generate(ctx context.Context, buf *bytes.Buffer) error {
return nil
}
-// idMapData implements vfs.DynamicBytesSource for /proc/[pid]/{gid_map|uid_map}.
+// idMapData implements vfs.WritableDynamicBytesSource for
+// /proc/[pid]/{gid_map|uid_map}.
//
// +stateify savable
type idMapData struct {
@@ -294,7 +299,7 @@ type idMapData struct {
var _ dynamicInode = (*idMapData)(nil)
-// Generate implements vfs.DynamicBytesSource.Generate.
+// Generate implements vfs.WritableDynamicBytesSource.Generate.
func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error {
var entries []auth.IDMapEntry
if d.gids {
@@ -308,6 +313,216 @@ func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error {
return nil
}
+// Write implements vfs.WritableDynamicBytesSource.Write.
+func (d *idMapData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+ // "In addition, the number of bytes written to the file must be less than
+ // the system page size, and the write must be performed at the start of
+ // the file ..." - user_namespaces(7)
+ srclen := src.NumBytes()
+ if srclen >= usermem.PageSize || offset != 0 {
+ return 0, syserror.EINVAL
+ }
+ b := make([]byte, srclen)
+ if _, err := src.CopyIn(ctx, b); err != nil {
+ return 0, err
+ }
+
+ // Truncate from the first NULL byte.
+ nul := int64(bytes.IndexByte(b, 0))
+ if nul == -1 {
+ nul = srclen
+ }
+ b = b[:nul]
+ // Remove the last \n.
+ if nul >= 1 && b[nul-1] == '\n' {
+ b = b[:nul-1]
+ }
+ lines := bytes.SplitN(b, []byte("\n"), maxIDMapLines+1)
+ if len(lines) > maxIDMapLines {
+ return 0, syserror.EINVAL
+ }
+
+ entries := make([]auth.IDMapEntry, len(lines))
+ for i, l := range lines {
+ var e auth.IDMapEntry
+ _, err := fmt.Sscan(string(l), &e.FirstID, &e.FirstParentID, &e.Length)
+ if err != nil {
+ return 0, syserror.EINVAL
+ }
+ entries[i] = e
+ }
+ var err error
+ if d.gids {
+ err = d.task.UserNamespace().SetGIDMap(ctx, entries)
+ } else {
+ err = d.task.UserNamespace().SetUIDMap(ctx, entries)
+ }
+ if err != nil {
+ return 0, err
+ }
+
+ // On success, Linux's kernel/user_namespace.c:map_write() always returns
+ // count, even if fewer bytes were used.
+ return int64(srclen), nil
+}
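
Write accepts the uid_map/gid_map format: up to five newline-separated lines of "firstID firstParentID length", e.g. the single line "0 1000 1" that tools like unshare(1) write. A standalone sketch of just the parsing step (idMapEntry is a stand-in for auth.IDMapEntry):

package main

import (
	"bytes"
	"fmt"
)

type idMapEntry struct {
	FirstID, FirstParentID, Length uint32
}

// parseIDMap mirrors the Write path above: split the (NUL- and
// newline-trimmed) buffer into lines and Sscan exactly three decimal
// fields per line.
func parseIDMap(b []byte) ([]idMapEntry, error) {
	lines := bytes.Split(b, []byte("\n"))
	entries := make([]idMapEntry, len(lines))
	for i, l := range lines {
		var e idMapEntry
		if _, err := fmt.Sscan(string(l), &e.FirstID, &e.FirstParentID, &e.Length); err != nil {
			return nil, err
		}
		entries[i] = e
	}
	return entries, nil
}

func main() {
	entries, err := parseIDMap([]byte("0 1000 1"))
	fmt.Println(entries, err) // [{0 1000 1}] <nil>
}
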
+
+var _ kernfs.Inode = (*memInode)(nil)
+
+// memInode implements kernfs.Inode for /proc/[pid]/mem.
+//
+// +stateify savable
+type memInode struct {
+ kernfs.InodeAttrs
+ kernfs.InodeNoStatFS
+ kernfs.InodeNoopRefCount
+ kernfs.InodeNotDirectory
+ kernfs.InodeNotSymlink
+
+ task *kernel.Task
+ locks vfs.FileLocks
+}
+
+func (fs *filesystem) newMemInode(task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode {
+ // Note: credentials are overridden by taskOwnedInode.
+ inode := &memInode{task: task}
+ inode.init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm)
+ return &taskOwnedInode{Inode: inode, owner: task}
+}
+
+func (f *memInode) init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
+ if perm&^linux.PermissionsMask != 0 {
+ panic(fmt.Sprintf("invalid mode: only permission bits may be set, got %x", perm))
+ }
+ f.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
+}
+
+// Open implements kernfs.Inode.Open.
+func (f *memInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ // TODO(gvisor.dev/issue/260): Add a check for PTRACE_MODE_ATTACH_FSCREDS.
+ // Permission to read this file is governed by PTRACE_MODE_ATTACH_FSCREDS;
+ // since we don't implement setfsuid/setfsgid, we can just use
+ // PTRACE_MODE_ATTACH.
+ if !kernel.ContextCanTrace(ctx, f.task, true) {
+ return nil, syserror.EACCES
+ }
+ if err := checkTaskState(f.task); err != nil {
+ return nil, err
+ }
+ fd := &memFD{}
+ if err := fd.Init(rp.Mount(), d, f, opts.Flags); err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
+
+// SetStat implements kernfs.Inode.SetStat.
+func (*memInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+ return syserror.EPERM
+}
+
+var _ vfs.FileDescriptionImpl = (*memFD)(nil)
+
+// memFD implements vfs.FileDescriptionImpl for /proc/[pid]/mem.
+//
+// +stateify savable
+type memFD struct {
+ vfsfd vfs.FileDescription
+ vfs.FileDescriptionDefaultImpl
+ vfs.LockFD
+
+ inode *memInode
+
+ // mu guards the fields below.
+ mu sync.Mutex `state:"nosave"`
+ offset int64
+}
+
+// Init initializes memFD.
+func (fd *memFD) Init(m *vfs.Mount, d *kernfs.Dentry, inode *memInode, flags uint32) error {
+ fd.LockFD.Init(&inode.locks)
+ if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
+ return err
+ }
+ fd.inode = inode
+ return nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *memFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ switch whence {
+ case linux.SEEK_SET:
+ case linux.SEEK_CUR:
+ offset += fd.offset
+ default:
+ return 0, syserror.EINVAL
+ }
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+ fd.offset = offset
+ return offset, nil
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *memFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ if dst.NumBytes() == 0 {
+ return 0, nil
+ }
+ m, err := getMMIncRef(fd.inode.task)
+ if err != nil {
+ return 0, err
+ }
+ defer m.DecUsers(ctx)
+ // Buffer the read data because of MM locks.
+ buf := make([]byte, dst.NumBytes())
+ n, readErr := m.CopyIn(ctx, usermem.Addr(offset), buf, usermem.IOOpts{IgnorePermissions: true})
+ if n > 0 {
+ if _, err := dst.CopyOut(ctx, buf[:n]); err != nil {
+ return 0, syserror.EFAULT
+ }
+ return int64(n), nil
+ }
+ if readErr != nil {
+ return 0, syserror.EIO
+ }
+ return 0, nil
+}
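
PRead treats the file offset as a virtual address in the target's address space, same as Linux. On a live Linux system the equivalent path can be exercised against /proc/self/mem; a small runnable example (Linux-only):

package main

import (
	"fmt"
	"os"
	"runtime"
	"unsafe"
)

func main() {
	data := []byte("hello from my own address space")
	f, err := os.Open("/proc/self/mem")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// The file offset is a virtual address, so ReadAt at &data[0] reads the
	// buffer back out through the mem file.
	buf := make([]byte, len(data))
	addr := int64(uintptr(unsafe.Pointer(&data[0])))
	if _, err := f.ReadAt(buf, addr); err != nil {
		panic(err)
	}
	runtime.KeepAlive(data)
	fmt.Println(string(buf))
}
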
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *memFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ fd.mu.Lock()
+ n, err := fd.PRead(ctx, dst, fd.offset, opts)
+ fd.offset += n
+ fd.mu.Unlock()
+ return n, err
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *memFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+ fs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
+ return fd.inode.Stat(ctx, fs, opts)
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *memFD) SetStat(context.Context, vfs.SetStatOptions) error {
+ return syserror.EPERM
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *memFD) Release(context.Context) {}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *memFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *memFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
+
// mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps.
//
// +stateify savable
@@ -482,7 +697,7 @@ func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error {
var vss, rss, data uint64
s.task.WithMuLocked(func(t *kernel.Task) {
if fdTable := t.FDTable(); fdTable != nil {
- fds = fdTable.Size()
+ fds = fdTable.CurrentMaxFDs()
}
if mm := t.MemoryManager(); mm != nil {
vss = mm.VirtualMemorySize()
@@ -587,6 +802,7 @@ func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset
//
// +stateify savable
type exeSymlink struct {
+ implStatFS
kernfs.InodeAttrs
kernfs.InodeNoopRefCount
kernfs.InodeSymlink
@@ -596,29 +812,30 @@ type exeSymlink struct {
var _ kernfs.Inode = (*exeSymlink)(nil)
-func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentry {
+func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) kernfs.Inode {
inode := &exeSymlink{task: task}
- inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
-
- d := &kernfs.Dentry{}
- d.Init(inode)
- return d
+ inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
+ return inode
}
-// Readlink implements kernfs.Inode.
-func (s *exeSymlink) Readlink(ctx context.Context) (string, error) {
- if !kernel.ContextCanTrace(ctx, s.task, false) {
- return "", syserror.EACCES
- }
-
- // Pull out the executable for /proc/[pid]/exe.
- exec, err := s.executable()
+// Readlink implements kernfs.Inode.Readlink.
+func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
+ exec, _, err := s.Getlink(ctx, nil)
if err != nil {
return "", err
}
- defer exec.DecRef()
+ defer exec.DecRef(ctx)
+
+ root := vfs.RootFromContext(ctx)
+ if !root.Ok() {
+ // It could have raced with process deletion.
+ return "", syserror.ESRCH
+ }
+ defer root.DecRef(ctx)
- return exec.PathnameWithDeleted(ctx), nil
+ vfsObj := exec.Mount().Filesystem().VirtualFilesystem()
+ name, _ := vfsObj.PathnameWithDeleted(ctx, root, exec)
+ return name, nil
}
// Getlink implements kernfs.Inode.Getlink.
@@ -626,23 +843,12 @@ func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDent
if !kernel.ContextCanTrace(ctx, s.task, false) {
return vfs.VirtualDentry{}, "", syserror.EACCES
}
-
- exec, err := s.executable()
- if err != nil {
- return vfs.VirtualDentry{}, "", err
- }
- defer exec.DecRef()
-
- vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry()
- vd.IncRef()
- return vd, "", nil
-}
-
-func (s *exeSymlink) executable() (file fsbridge.File, err error) {
if err := checkTaskState(s.task); err != nil {
- return nil, err
+ return vfs.VirtualDentry{}, "", err
}
+ var err error
+ var exec fsbridge.File
s.task.WithMuLocked(func(t *kernel.Task) {
mm := t.MemoryManager()
if mm == nil {
@@ -653,12 +859,75 @@ func (s *exeSymlink) executable() (file fsbridge.File, err error) {
// The MemoryManager may be destroyed, in which case
// MemoryManager.destroy will simply set the executable to nil
// (with locks held).
- file = mm.Executable()
- if file == nil {
+ exec = mm.Executable()
+ if exec == nil {
err = syserror.ESRCH
}
})
- return
+ if err != nil {
+ return vfs.VirtualDentry{}, "", err
+ }
+ defer exec.DecRef(ctx)
+
+ vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry()
+ vd.IncRef()
+ return vd, "", nil
+}
+
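The Getlink/Readlink pair above backs readlink(2) on /proc/[pid]/exe, with PathnameWithDeleted appending " (deleted)" once the running binary has been unlinked. A quick standard-library check of that behavior (illustrative sketch):

package main

import (
	"fmt"
	"os"
)

func main() {
	// /proc/self/exe resolves to the running binary; if the binary is
	// unlinked while running, the target gains a " (deleted)" suffix,
	// the same output PathnameWithDeleted produces above.
	target, err := os.Readlink("/proc/self/exe")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}
	fmt.Println(target)
}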
+// cwdSymlink is a symlink for the /proc/[pid]/cwd file.
+//
+// +stateify savable
+type cwdSymlink struct {
+ implStatFS
+ kernfs.InodeAttrs
+ kernfs.InodeNoopRefCount
+ kernfs.InodeSymlink
+
+ task *kernel.Task
+}
+
+var _ kernfs.Inode = (*cwdSymlink)(nil)
+
+func (fs *filesystem) newCwdSymlink(task *kernel.Task, ino uint64) kernfs.Inode {
+ inode := &cwdSymlink{task: task}
+ inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
+ return inode
+}
+
+// Readlink implements kernfs.Inode.Readlink.
+func (s *cwdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
+ cwd, _, err := s.Getlink(ctx, nil)
+ if err != nil {
+ return "", err
+ }
+ defer cwd.DecRef(ctx)
+
+ root := vfs.RootFromContext(ctx)
+ if !root.Ok() {
+ // It could have raced with process deletion.
+ return "", syserror.ESRCH
+ }
+ defer root.DecRef(ctx)
+
+ vfsObj := cwd.Mount().Filesystem().VirtualFilesystem()
+ name, _ := vfsObj.PathnameWithDeleted(ctx, root, cwd)
+ return name, nil
+}
+
+// Getlink implements kernfs.Inode.Getlink.
+func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
+ if !kernel.ContextCanTrace(ctx, s.task, false) {
+ return vfs.VirtualDentry{}, "", syserror.EACCES
+ }
+ if err := checkTaskState(s.task); err != nil {
+ return vfs.VirtualDentry{}, "", err
+ }
+ cwd := s.task.FSContext().WorkingDirectoryVFS2()
+ if !cwd.Ok() {
+ // It could have raced with process deletion.
+ return vfs.VirtualDentry{}, "", syserror.ESRCH
+ }
+ return cwd, "", nil
}
// mountInfoData is used to implement /proc/[pid]/mountinfo.
@@ -687,7 +956,7 @@ func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
// Root has been destroyed. Don't try to read mounts.
return nil
}
- defer rootDir.DecRef()
+ defer rootDir.DecRef(ctx)
i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf)
return nil
}
@@ -718,18 +987,19 @@ func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
// Root has been destroyed. Don't try to read mounts.
return nil
}
- defer rootDir.DecRef()
+ defer rootDir.DecRef(ctx)
i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf)
return nil
}
+// +stateify savable
type namespaceSymlink struct {
kernfs.StaticSymlink
task *kernel.Task
}
-func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry {
+func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) kernfs.Inode {
// Namespace symlinks should contain the namespace name and the inode number
// for the namespace instance, so for example user:[123456]. We currently fake
// the inode number by sticking the symlink inode in its place.
@@ -737,61 +1007,68 @@ func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns stri
inode := &namespaceSymlink{task: task}
// Note: credentials are overridden by taskOwnedInode.
- inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target)
+ inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target)
taskInode := &taskOwnedInode{Inode: inode, owner: task}
- d := &kernfs.Dentry{}
- d.Init(taskInode)
- return d
+ return taskInode
}
-// Readlink implements Inode.
-func (s *namespaceSymlink) Readlink(ctx context.Context) (string, error) {
+// Readlink implements kernfs.Inode.Readlink.
+func (s *namespaceSymlink) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
if err := checkTaskState(s.task); err != nil {
return "", err
}
- return s.StaticSymlink.Readlink(ctx)
+ return s.StaticSymlink.Readlink(ctx, mnt)
}
-// Getlink implements Inode.Getlink.
+// Getlink implements kernfs.Inode.Getlink.
func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
if err := checkTaskState(s.task); err != nil {
return vfs.VirtualDentry{}, "", err
}
// Create a synthetic inode to represent the namespace.
+ fs := mnt.Filesystem().Impl().(*filesystem)
+ nsInode := &namespaceInode{}
+ nsInode.Init(ctx, auth.CredentialsFromContext(ctx), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0444)
dentry := &kernfs.Dentry{}
- dentry.Init(&namespaceInode{})
+ dentry.Init(&fs.Filesystem, nsInode)
vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry())
- vd.IncRef()
- dentry.DecRef()
+ // Only IncRef vd.Mount() because vd.Dentry() already holds a ref of 1.
+ mnt.IncRef()
return vd, "", nil
}
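The user:[123456]-style targets synthesized above can be observed from userspace: every entry under /proc/self/ns/ reads back as "<name>:[<inode>]". A small illustrative sketch:

package main

import (
	"fmt"
	"os"
)

func main() {
	// Each namespace symlink's target embeds an inode number, the value
	// newNamespaceSymlink fakes with the symlink's own inode above.
	for _, ns := range []string{"uts", "pid", "net", "user"} {
		target, err := os.Readlink("/proc/self/ns/" + ns)
		if err != nil {
			continue // namespace type may be unavailable
		}
		fmt.Printf("%s -> %s\n", ns, target)
	}
}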
// namespaceInode is a synthetic inode created to represent a namespace in
// /proc/[pid]/ns/*.
+//
+// +stateify savable
type namespaceInode struct {
+ implStatFS
kernfs.InodeAttrs
kernfs.InodeNoopRefCount
kernfs.InodeNotDirectory
kernfs.InodeNotSymlink
+
+ locks vfs.FileLocks
}
var _ kernfs.Inode = (*namespaceInode)(nil)
// Init initializes a namespace inode.
-func (i *namespaceInode) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
+func (i *namespaceInode) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
if perm&^linux.PermissionsMask != 0 {
panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
}
- i.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
+ i.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
}
-// Open implements Inode.Open.
-func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+// Open implements kernfs.Inode.Open.
+func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd := &namespaceFD{inode: i}
i.IncRef()
- if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+ fd.LockFD.Init(&i.locks)
+ if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
return nil, err
}
return &fd.vfsfd, nil
@@ -799,8 +1076,11 @@ func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *
// namespaceFD is a synthetic file that represents a namespace in
// /proc/[pid]/ns/*.
+//
+// +stateify savable
type namespaceFD struct {
vfs.FileDescriptionDefaultImpl
+ vfs.LockFD
vfsfd vfs.FileDescription
inode *namespaceInode
@@ -808,25 +1088,30 @@ type namespaceFD struct {
var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil)
-// Stat implements FileDescriptionImpl.
+// Stat implements vfs.FileDescriptionImpl.Stat.
func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
- return fd.inode.Stat(vfs, opts)
+ return fd.inode.Stat(ctx, vfs, opts)
}
-// SetStat implements FileDescriptionImpl.
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
creds := auth.CredentialsFromContext(ctx)
return fd.inode.SetStat(ctx, vfs, creds, opts)
}
-// Release implements FileDescriptionImpl.
-func (fd *namespaceFD) Release() {
- fd.inode.DecRef()
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *namespaceFD) Release(ctx context.Context) {
+ fd.inode.DecRef(ctx)
}
-// OnClose implements FileDescriptionImpl.
-func (*namespaceFD) OnClose(context.Context) error {
- return nil
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *namespaceFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *namespaceFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
}
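namespaceInode.Open takes an inode reference (i.IncRef()) that namespaceFD.Release now drops with the context it needs for teardown. A toy sketch of that pairing; the types here are illustrative stand-ins, not kernfs APIs:

package main

import "fmt"

// inode is a toy refcounted object standing in for namespaceInode.
type inode struct{ refs int }

func (i *inode) IncRef() { i.refs++ }
func (i *inode) DecRef() {
	if i.refs--; i.refs == 0 {
		fmt.Println("inode destroyed")
	}
}

type fd struct{ inode *inode }

// open mirrors namespaceInode.Open: the new FD owns one inode ref.
func open(i *inode) *fd { i.IncRef(); return &fd{inode: i} }

// release mirrors namespaceFD.Release: the ref taken at open time is
// dropped exactly once, when the FD is destroyed.
func (f *fd) release() { f.inode.DecRef() }

func main() {
	i := &inode{refs: 1} // creation reference
	f := open(i)
	f.release()
	i.DecRef() // drop the creation reference -> "inode destroyed"
}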
diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go
index 6bde27376..5a9ee111f 100644
--- a/pkg/sentry/fsimpl/proc/task_net.go
+++ b/pkg/sentry/fsimpl/proc/task_net.go
@@ -37,12 +37,12 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
-func (fs *filesystem) newTaskNetDir(task *kernel.Task) *kernfs.Dentry {
+func (fs *filesystem) newTaskNetDir(task *kernel.Task) kernfs.Inode {
k := task.Kernel()
pidns := task.PIDNamespace()
root := auth.NewRootCredentials(pidns.UserNamespace())
- var contents map[string]*kernfs.Dentry
+ var contents map[string]kernfs.Inode
if stack := task.NetworkNamespace().Stack(); stack != nil {
const (
arp = "IP address HW type Flags HW address Mask Device\n"
@@ -56,34 +56,34 @@ func (fs *filesystem) newTaskNetDir(task *kernel.Task) *kernfs.Dentry {
// TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task
// network namespace.
- contents = map[string]*kernfs.Dentry{
- "dev": fs.newDentry(root, fs.NextIno(), 0444, &netDevData{stack: stack}),
- "snmp": fs.newDentry(root, fs.NextIno(), 0444, &netSnmpData{stack: stack}),
+ contents = map[string]kernfs.Inode{
+ "dev": fs.newInode(task, root, 0444, &netDevData{stack: stack}),
+ "snmp": fs.newInode(task, root, 0444, &netSnmpData{stack: stack}),
// The following files are simple stubs until they are implemented in
// netstack. If the file contains a header, the stub is just the header;
// otherwise it is an empty file.
- "arp": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(arp)),
- "netlink": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(netlink)),
- "netstat": fs.newDentry(root, fs.NextIno(), 0444, &netStatData{}),
- "packet": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(packet)),
- "protocols": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(protocols)),
+ "arp": fs.newInode(task, root, 0444, newStaticFile(arp)),
+ "netlink": fs.newInode(task, root, 0444, newStaticFile(netlink)),
+ "netstat": fs.newInode(task, root, 0444, &netStatData{}),
+ "packet": fs.newInode(task, root, 0444, newStaticFile(packet)),
+ "protocols": fs.newInode(task, root, 0444, newStaticFile(protocols)),
// Linux sets psched values to: nsec per usec, psched tick in ns, 1000000,
// high res timer ticks per sec (ClockGetres returns 1ns resolution).
- "psched": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(psched)),
- "ptype": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(ptype)),
- "route": fs.newDentry(root, fs.NextIno(), 0444, &netRouteData{stack: stack}),
- "tcp": fs.newDentry(root, fs.NextIno(), 0444, &netTCPData{kernel: k}),
- "udp": fs.newDentry(root, fs.NextIno(), 0444, &netUDPData{kernel: k}),
- "unix": fs.newDentry(root, fs.NextIno(), 0444, &netUnixData{kernel: k}),
+ "psched": fs.newInode(task, root, 0444, newStaticFile(psched)),
+ "ptype": fs.newInode(task, root, 0444, newStaticFile(ptype)),
+ "route": fs.newInode(task, root, 0444, &netRouteData{stack: stack}),
+ "tcp": fs.newInode(task, root, 0444, &netTCPData{kernel: k}),
+ "udp": fs.newInode(task, root, 0444, &netUDPData{kernel: k}),
+ "unix": fs.newInode(task, root, 0444, &netUnixData{kernel: k}),
}
if stack.SupportsIPv6() {
- contents["if_inet6"] = fs.newDentry(root, fs.NextIno(), 0444, &ifinet6{stack: stack})
- contents["ipv6_route"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(""))
- contents["tcp6"] = fs.newDentry(root, fs.NextIno(), 0444, &netTCP6Data{kernel: k})
- contents["udp6"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(upd6))
+ contents["if_inet6"] = fs.newInode(task, root, 0444, &ifinet6{stack: stack})
+ contents["ipv6_route"] = fs.newInode(task, root, 0444, newStaticFile(""))
+ contents["tcp6"] = fs.newInode(task, root, 0444, &netTCP6Data{kernel: k})
+ contents["udp6"] = fs.newInode(task, root, 0444, newStaticFile(upd6))
}
}
@@ -212,7 +212,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
continue
}
if family, _, _ := s.Impl().(socket.SocketVFS2).Type(); family != linux.AF_UNIX {
- s.DecRef()
+ s.DecRef(ctx)
// Not a unix socket.
continue
}
@@ -262,7 +262,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
// For now, we always redact this pointer.
fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %8d",
(*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct.
- s.Refs()-1, // RefCount, don't count our own ref.
+ s.ReadRefs()-1, // RefCount, don't count our own ref.
0, // Protocol, always 0 for UDS.
sockFlags, // Flags.
sops.Endpoint().Type(), // Type.
@@ -281,7 +281,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
}
fmt.Fprintf(buf, "\n")
- s.DecRef()
+ s.DecRef(ctx)
}
return nil
}
@@ -359,7 +359,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel,
panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s))
}
if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) {
- s.DecRef()
+ s.DecRef(ctx)
// Not tcp4 sockets.
continue
}
@@ -430,7 +430,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel,
// Field: refcount. Don't count the ref we obtain while dereferencing
// the weakref to this socket.
- fmt.Fprintf(buf, "%d ", s.Refs()-1)
+ fmt.Fprintf(buf, "%d ", s.ReadRefs()-1)
// Field: Socket struct address. Redacted due to the same reason as
// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
@@ -455,7 +455,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel,
fmt.Fprintf(buf, "\n")
- s.DecRef()
+ s.DecRef(ctx)
}
return nil
@@ -524,7 +524,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s))
}
if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM {
- s.DecRef()
+ s.DecRef(ctx)
// Not udp4 socket.
continue
}
@@ -589,7 +589,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
// Field: ref; reference count on the socket inode. Don't count the ref
// we obtain while deferencing the weakref to this socket.
- fmt.Fprintf(buf, "%d ", s.Refs()-1)
+ fmt.Fprintf(buf, "%d ", s.ReadRefs()-1)
// Field: Socket struct address. Redacted due to the same reason as
// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
@@ -600,7 +600,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
fmt.Fprintf(buf, "\n")
- s.DecRef()
+ s.DecRef(ctx)
}
return nil
}
@@ -616,6 +616,7 @@ type netSnmpData struct {
var _ dynamicInode = (*netSnmpData)(nil)
+// +stateify savable
type snmpLine struct {
prefix string
header string
@@ -660,7 +661,7 @@ func sprintSlice(s []uint64) string {
return r[1 : len(r)-1] // Remove "[]" introduced by fmt of slice.
}
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error {
types := []interface{}{
&inet.StatSNMPIP{},
@@ -709,7 +710,7 @@ type netRouteData struct {
var _ dynamicInode = (*netRouteData)(nil)
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show.
func (d *netRouteData) Generate(ctx context.Context, buf *bytes.Buffer) error {
fmt.Fprintf(buf, "%-127s\n", "Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT")
@@ -773,7 +774,7 @@ type netStatData struct {
var _ dynamicInode = (*netStatData)(nil)
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show.
func (d *netStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
buf.WriteString("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed " +
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index b51d43954..b81ea14bf 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -37,19 +37,22 @@ const (
//
// +stateify savable
type tasksInode struct {
- kernfs.InodeNotSymlink
- kernfs.InodeDirectoryNoNewChildren
+ implStatFS
+ kernfs.InodeAlwaysValid
kernfs.InodeAttrs
+ kernfs.InodeDirectoryNoNewChildren
+ kernfs.InodeNotSymlink
+ kernfs.InodeTemporary // Holds no meaning here; this inode can't be looked up and is always valid.
kernfs.OrderedChildren
- kernfs.AlwaysValid
+ tasksInodeRefs
+
+ locks vfs.FileLocks
fs *filesystem
pidns *kernel.PIDNamespace
// '/proc/self' and '/proc/thread-self' have custom directory offsets in
// Linux. So handle them outside of OrderedChildren.
- selfSymlink *vfs.Dentry
- threadSelfSymlink *vfs.Dentry
// cgroupControllers is a map of controller name to directory in the
// cgroup hierarchy. These controllers are immutable and will be listed
@@ -59,51 +62,53 @@ type tasksInode struct {
var _ kernfs.Inode = (*tasksInode)(nil)
-func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) {
+func (fs *filesystem) newTasksInode(ctx context.Context, k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) *tasksInode {
root := auth.NewRootCredentials(pidns.UserNamespace())
- contents := map[string]*kernfs.Dentry{
- "cpuinfo": fs.newDentry(root, fs.NextIno(), 0444, newStaticFileSetStat(cpuInfoData(k))),
- "filesystems": fs.newDentry(root, fs.NextIno(), 0444, &filesystemsData{}),
- "loadavg": fs.newDentry(root, fs.NextIno(), 0444, &loadavgData{}),
- "sys": fs.newSysDir(root, k),
- "meminfo": fs.newDentry(root, fs.NextIno(), 0444, &meminfoData{}),
- "mounts": kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/mounts"),
- "net": kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/net"),
- "stat": fs.newDentry(root, fs.NextIno(), 0444, &statData{}),
- "uptime": fs.newDentry(root, fs.NextIno(), 0444, &uptimeData{}),
- "version": fs.newDentry(root, fs.NextIno(), 0444, &versionData{}),
+ contents := map[string]kernfs.Inode{
+ "cpuinfo": fs.newInode(ctx, root, 0444, newStaticFileSetStat(cpuInfoData(k))),
+ "filesystems": fs.newInode(ctx, root, 0444, &filesystemsData{}),
+ "loadavg": fs.newInode(ctx, root, 0444, &loadavgData{}),
+ "sys": fs.newSysDir(ctx, root, k),
+ "meminfo": fs.newInode(ctx, root, 0444, &meminfoData{}),
+ "mounts": kernfs.NewStaticSymlink(ctx, root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/mounts"),
+ "net": kernfs.NewStaticSymlink(ctx, root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/net"),
+ "stat": fs.newInode(ctx, root, 0444, &statData{}),
+ "uptime": fs.newInode(ctx, root, 0444, &uptimeData{}),
+ "version": fs.newInode(ctx, root, 0444, &versionData{}),
}
inode := &tasksInode{
pidns: pidns,
fs: fs,
- selfSymlink: fs.newSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(),
- threadSelfSymlink: fs.newThreadSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(),
cgroupControllers: cgroupControllers,
}
- inode.InodeAttrs.Init(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
-
- dentry := &kernfs.Dentry{}
- dentry.Init(inode)
+ inode.InodeAttrs.Init(ctx, root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+ inode.EnableLeakCheck()
inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
- links := inode.OrderedChildren.Populate(dentry, contents)
+ links := inode.OrderedChildren.Populate(contents)
inode.IncLinks(links)
- return inode, dentry
+ return inode
}
-// Lookup implements kernfs.inodeDynamicLookup.
-func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
- // Try to lookup a corresponding task.
+// Lookup implements kernfs.inodeDirectory.Lookup.
+func (i *tasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
+ // Check if a static entry was looked up.
+ if d, err := i.OrderedChildren.Lookup(ctx, name); err == nil {
+ return d, nil
+ }
+
+ // Not a static entry. Try to lookup a corresponding task.
tid, err := strconv.ParseUint(name, 10, 64)
if err != nil {
+ root := auth.NewRootCredentials(i.pidns.UserNamespace())
// If it failed to parse, check if it's one of the special handled files.
switch name {
case selfName:
- return i.selfSymlink, nil
+ return i.newSelfSymlink(ctx, root), nil
case threadSelfName:
- return i.threadSelfSymlink, nil
+ return i.newThreadSelfSymlink(ctx, root), nil
}
return nil, syserror.ENOENT
}
@@ -113,12 +118,11 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro
return nil, syserror.ENOENT
}
- taskDentry := i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers)
- return taskDentry.VFSDentry(), nil
+ return i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers)
}
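The Lookup above resolves names in a fixed order: static entries, then the special non-numeric names, then live PIDs. A plain-Go sketch of that dispatch (the names and maps here are illustrative, not the kernfs API):

package main

import (
	"errors"
	"fmt"
	"strconv"
)

var errNoEnt = errors.New("ENOENT")

// lookupProcRoot mirrors tasksInode.Lookup's dispatch order.
func lookupProcRoot(static map[string]bool, livePIDs map[uint64]bool, name string) (string, error) {
	if static[name] {
		return "static entry", nil
	}
	tid, err := strconv.ParseUint(name, 10, 64)
	if err != nil {
		// Non-numeric: only the magic symlinks remain.
		if name == "self" || name == "thread-self" {
			return "magic symlink", nil
		}
		return "", errNoEnt
	}
	if !livePIDs[tid] {
		return "", errNoEnt
	}
	return "task dir", nil
}

func main() {
	static := map[string]bool{"cpuinfo": true, "meminfo": true}
	pids := map[uint64]bool{1: true, 42: true}
	for _, n := range []string{"cpuinfo", "self", "42", "999"} {
		r, err := lookupProcRoot(static, pids, n)
		fmt.Println(n, "->", r, err)
	}
}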
-// IterDirents implements kernfs.inodeDynamicLookup.
-func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) {
+// IterDirents implements kernfs.inodeDirectory.IterDirents.
+func (i *tasksInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) {
// fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256
const FIRST_PROCESS_ENTRY = 256
@@ -195,17 +199,19 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
return maxTaskID, nil
}
-// Open implements kernfs.Inode.
-func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+// Open implements kernfs.Inode.Open.
+func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+ SeekEnd: kernfs.SeekEndZero,
+ })
if err != nil {
return nil, err
}
return fd.VFSFileDescription(), nil
}
-func (i *tasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
- stat, err := i.InodeAttrs.Stat(vsfs, opts)
+func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+ stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts)
if err != nil {
return linux.Statx{}, err
}
@@ -222,9 +228,16 @@ func (i *tasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Sta
return stat, nil
}
+// DecRef implements kernfs.Inode.DecRef.
+func (i *tasksInode) DecRef(ctx context.Context) {
+ i.tasksInodeRefs.DecRef(func() { i.Destroy(ctx) })
+}
+
// staticFileSetStat implements a special static file that allows inode
// attributes to be set. This is to support /proc files that are readonly, but
// allow attributes to be set.
+//
+// +stateify savable
type staticFileSetStat struct {
dynamicBytesFileSetAttr
vfs.StaticData
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
index 7d8983aa5..01b7a6678 100644
--- a/pkg/sentry/fsimpl/proc/tasks_files.go
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -31,7 +31,9 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
+// +stateify savable
type selfSymlink struct {
+ implStatFS
kernfs.InodeAttrs
kernfs.InodeNoopRefCount
kernfs.InodeSymlink
@@ -41,16 +43,13 @@ type selfSymlink struct {
var _ kernfs.Inode = (*selfSymlink)(nil)
-func (fs *filesystem) newSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry {
- inode := &selfSymlink{pidns: pidns}
- inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
-
- d := &kernfs.Dentry{}
- d.Init(inode)
- return d
+func (i *tasksInode) newSelfSymlink(ctx context.Context, creds *auth.Credentials) kernfs.Inode {
+ inode := &selfSymlink{pidns: i.pidns}
+ inode.Init(ctx, creds, linux.UNNAMED_MAJOR, i.fs.devMinor, i.fs.NextIno(), linux.ModeSymlink|0777)
+ return inode
}
-func (s *selfSymlink) Readlink(ctx context.Context) (string, error) {
+func (s *selfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
t := kernel.TaskFromContext(ctx)
if t == nil {
// Who is reading this link?
@@ -63,17 +62,19 @@ func (s *selfSymlink) Readlink(ctx context.Context) (string, error) {
return strconv.FormatUint(uint64(tgid), 10), nil
}
-func (s *selfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
- target, err := s.Readlink(ctx)
+func (s *selfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
+ target, err := s.Readlink(ctx, mnt)
return vfs.VirtualDentry{}, target, err
}
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
return syserror.EPERM
}
+// +stateify savable
type threadSelfSymlink struct {
+ implStatFS
kernfs.InodeAttrs
kernfs.InodeNoopRefCount
kernfs.InodeSymlink
@@ -83,16 +84,13 @@ type threadSelfSymlink struct {
var _ kernfs.Inode = (*threadSelfSymlink)(nil)
-func (fs *filesystem) newThreadSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry {
- inode := &threadSelfSymlink{pidns: pidns}
- inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
-
- d := &kernfs.Dentry{}
- d.Init(inode)
- return d
+func (i *tasksInode) newThreadSelfSymlink(ctx context.Context, creds *auth.Credentials) kernfs.Inode {
+ inode := &threadSelfSymlink{pidns: i.pidns}
+ inode.Init(ctx, creds, linux.UNNAMED_MAJOR, i.fs.devMinor, i.fs.NextIno(), linux.ModeSymlink|0777)
+ return inode
}
-func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) {
+func (s *threadSelfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
t := kernel.TaskFromContext(ctx)
if t == nil {
// Who is reading this link?
@@ -106,12 +104,12 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) {
return fmt.Sprintf("%d/task/%d", tgid, tid), nil
}
-func (s *threadSelfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
- target, err := s.Readlink(ctx)
+func (s *threadSelfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
+ target, err := s.Readlink(ctx, mnt)
return vfs.VirtualDentry{}, target, err
}
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
return syserror.EPERM
}
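Both symlinks above compute their targets per reader, which is observable with plain readlink(2): /proc/self yields "<tgid>" and /proc/thread-self yields "<tgid>/task/<tid>". A short illustrative check:

package main

import (
	"fmt"
	"os"
)

func main() {
	// Magic symlinks: the target depends on who is reading, exactly as
	// Readlink above derives it from TaskFromContext.
	for _, p := range []string{"/proc/self", "/proc/thread-self"} {
		if target, err := os.Readlink(p); err == nil {
			fmt.Printf("%s -> %s\n", p, target)
		}
	}
}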
@@ -119,16 +117,20 @@ func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Creden
// dynamicBytesFileSetAttr implements a special file that allows inode
// attributes to be set. This is to support /proc files that are readonly, but
// allow attributes to be set.
+//
+// +stateify savable
type dynamicBytesFileSetAttr struct {
kernfs.DynamicBytesFile
}
-// SetStat implements Inode.SetStat.
+// SetStat implements kernfs.Inode.SetStat.
func (d *dynamicBytesFileSetAttr) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
return d.DynamicBytesFile.InodeAttrs.SetStat(ctx, fs, creds, opts)
}
// cpuStats contains the breakdown of CPU time for /proc/stat.
+//
+// +stateify savable
type cpuStats struct {
// user is time spent in userspace tasks with non-positive niceness.
user uint64
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
index 6dac2afa4..7c7afdcfa 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -25,94 +25,108 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
"gvisor.dev/gvisor/pkg/usermem"
)
+// +stateify savable
+type tcpMemDir int
+
+const (
+ tcpRMem tcpMemDir = iota
+ tcpWMem
+)
+
-// newSysDir returns the dentry corresponding to /proc/sys directory.
+// newSysDir returns the inode corresponding to the /proc/sys directory.
-func (fs *filesystem) newSysDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry {
- return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
- "kernel": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
- "hostname": fs.newDentry(root, fs.NextIno(), 0444, &hostnameData{}),
- "shmall": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMALL)),
- "shmmax": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMAX)),
- "shmmni": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMNI)),
+func (fs *filesystem) newSysDir(ctx context.Context, root *auth.Credentials, k *kernel.Kernel) kernfs.Inode {
+ return fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
+ "kernel": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
+ "hostname": fs.newInode(ctx, root, 0444, &hostnameData{}),
+ "shmall": fs.newInode(ctx, root, 0444, shmData(linux.SHMALL)),
+ "shmmax": fs.newInode(ctx, root, 0444, shmData(linux.SHMMAX)),
+ "shmmni": fs.newInode(ctx, root, 0444, shmData(linux.SHMMNI)),
}),
- "vm": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
- "mmap_min_addr": fs.newDentry(root, fs.NextIno(), 0444, &mmapMinAddrData{k: k}),
- "overcommit_memory": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0\n")),
+ "vm": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
+ "mmap_min_addr": fs.newInode(ctx, root, 0444, &mmapMinAddrData{k: k}),
+ "overcommit_memory": fs.newInode(ctx, root, 0444, newStaticFile("0\n")),
}),
- "net": fs.newSysNetDir(root, k),
+ "net": fs.newSysNetDir(ctx, root, k),
})
}
-// newSysNetDir returns the dentry corresponding to /proc/sys/net directory.
+// newSysNetDir returns the inode corresponding to the /proc/sys/net directory.
-func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry {
- var contents map[string]*kernfs.Dentry
+func (fs *filesystem) newSysNetDir(ctx context.Context, root *auth.Credentials, k *kernel.Kernel) kernfs.Inode {
+ var contents map[string]kernfs.Inode
// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
// network namespace of the calling process.
if stack := k.RootNetworkNamespace().Stack(); stack != nil {
- contents = map[string]*kernfs.Dentry{
- "ipv4": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
- "tcp_sack": fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}),
+ contents = map[string]kernfs.Inode{
+ "ipv4": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
+ "tcp_recovery": fs.newInode(ctx, root, 0644, &tcpRecoveryData{stack: stack}),
+ "tcp_rmem": fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpRMem}),
+ "tcp_sack": fs.newInode(ctx, root, 0644, &tcpSackData{stack: stack}),
+ "tcp_wmem": fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpWMem}),
+ "ip_forward": fs.newInode(ctx, root, 0444, &ipForwarding{stack: stack}),
// The following files are simple stubs until they are implemented in
// netstack; most of these files are configuration related. We use the
// value closest to the actual netstack behavior or an empty file; all
// of these files will have mode 0444 (read-only for all users).
- "ip_local_port_range": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("16000 65535")),
- "ip_local_reserved_ports": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")),
- "ipfrag_time": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("30")),
- "ip_nonlocal_bind": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
- "ip_no_pmtu_disc": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
+ "ip_local_port_range": fs.newInode(ctx, root, 0444, newStaticFile("16000 65535")),
+ "ip_local_reserved_ports": fs.newInode(ctx, root, 0444, newStaticFile("")),
+ "ipfrag_time": fs.newInode(ctx, root, 0444, newStaticFile("30")),
+ "ip_nonlocal_bind": fs.newInode(ctx, root, 0444, newStaticFile("0")),
+ "ip_no_pmtu_disc": fs.newInode(ctx, root, 0444, newStaticFile("1")),
// tcp_allowed_congestion_control tells the user what they are able to
// do as an unprivileged process, so we leave it empty.
- "tcp_allowed_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")),
- "tcp_available_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")),
- "tcp_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")),
+ "tcp_allowed_congestion_control": fs.newInode(ctx, root, 0444, newStaticFile("")),
+ "tcp_available_congestion_control": fs.newInode(ctx, root, 0444, newStaticFile("reno")),
+ "tcp_congestion_control": fs.newInode(ctx, root, 0444, newStaticFile("reno")),
// Many of the following stub files are features netstack doesn't
// support. The unsupported features return "0" to indicate they are
// disabled.
- "tcp_base_mss": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1280")),
- "tcp_dsack": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
- "tcp_early_retrans": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
- "tcp_fack": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
- "tcp_fastopen": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
- "tcp_fastopen_key": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")),
- "tcp_invalid_ratelimit": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
- "tcp_keepalive_intvl": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
- "tcp_keepalive_probes": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
- "tcp_keepalive_time": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("7200")),
- "tcp_mtu_probing": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
- "tcp_no_metrics_save": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
- "tcp_probe_interval": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
- "tcp_probe_threshold": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
- "tcp_retries1": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")),
- "tcp_retries2": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("15")),
- "tcp_rfc1337": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
- "tcp_slow_start_after_idle": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
- "tcp_synack_retries": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")),
- "tcp_syn_retries": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")),
- "tcp_timestamps": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
+ "tcp_base_mss": fs.newInode(ctx, root, 0444, newStaticFile("1280")),
+ "tcp_dsack": fs.newInode(ctx, root, 0444, newStaticFile("0")),
+ "tcp_early_retrans": fs.newInode(ctx, root, 0444, newStaticFile("0")),
+ "tcp_fack": fs.newInode(ctx, root, 0444, newStaticFile("0")),
+ "tcp_fastopen": fs.newInode(ctx, root, 0444, newStaticFile("0")),
+ "tcp_fastopen_key": fs.newInode(ctx, root, 0444, newStaticFile("")),
+ "tcp_invalid_ratelimit": fs.newInode(ctx, root, 0444, newStaticFile("0")),
+ "tcp_keepalive_intvl": fs.newInode(ctx, root, 0444, newStaticFile("0")),
+ "tcp_keepalive_probes": fs.newInode(ctx, root, 0444, newStaticFile("0")),
+ "tcp_keepalive_time": fs.newInode(ctx, root, 0444, newStaticFile("7200")),
+ "tcp_mtu_probing": fs.newInode(ctx, root, 0444, newStaticFile("0")),
+ "tcp_no_metrics_save": fs.newInode(ctx, root, 0444, newStaticFile("1")),
+ "tcp_probe_interval": fs.newInode(ctx, root, 0444, newStaticFile("0")),
+ "tcp_probe_threshold": fs.newInode(ctx, root, 0444, newStaticFile("0")),
+ "tcp_retries1": fs.newInode(ctx, root, 0444, newStaticFile("3")),
+ "tcp_retries2": fs.newInode(ctx, root, 0444, newStaticFile("15")),
+ "tcp_rfc1337": fs.newInode(ctx, root, 0444, newStaticFile("1")),
+ "tcp_slow_start_after_idle": fs.newInode(ctx, root, 0444, newStaticFile("1")),
+ "tcp_synack_retries": fs.newInode(ctx, root, 0444, newStaticFile("5")),
+ "tcp_syn_retries": fs.newInode(ctx, root, 0444, newStaticFile("3")),
+ "tcp_timestamps": fs.newInode(ctx, root, 0444, newStaticFile("1")),
}),
- "core": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
- "default_qdisc": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("pfifo_fast")),
- "message_burst": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("10")),
- "message_cost": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")),
- "optmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
- "rmem_default": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
- "rmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
- "somaxconn": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("128")),
- "wmem_default": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
- "wmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
+ "core": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
+ "default_qdisc": fs.newInode(ctx, root, 0444, newStaticFile("pfifo_fast")),
+ "message_burst": fs.newInode(ctx, root, 0444, newStaticFile("10")),
+ "message_cost": fs.newInode(ctx, root, 0444, newStaticFile("5")),
+ "optmem_max": fs.newInode(ctx, root, 0444, newStaticFile("0")),
+ "rmem_default": fs.newInode(ctx, root, 0444, newStaticFile("212992")),
+ "rmem_max": fs.newInode(ctx, root, 0444, newStaticFile("212992")),
+ "somaxconn": fs.newInode(ctx, root, 0444, newStaticFile("128")),
+ "wmem_default": fs.newInode(ctx, root, 0444, newStaticFile("212992")),
+ "wmem_max": fs.newInode(ctx, root, 0444, newStaticFile("212992")),
}),
}
}
- return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, contents)
+ return fs.newStaticDir(ctx, root, contents)
}
// mmapMinAddrData implements vfs.DynamicBytesSource for
@@ -163,7 +177,7 @@ type tcpSackData struct {
var _ vfs.WritableDynamicBytesSource = (*tcpSackData)(nil)
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error {
if d.enabled == nil {
sack, err := d.stack.TCPSACKEnabled()
@@ -180,10 +194,11 @@ func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error {
// Tough luck.
val = "1\n"
}
- buf.WriteString(val)
- return nil
+ _, err := buf.WriteString(val)
+ return err
}
+// Write implements vfs.WritableDynamicBytesSource.Write.
func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
if offset != 0 {
// No need to handle partial writes thus far.
@@ -199,7 +214,7 @@ func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset
var v int32
n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
if err != nil {
- return n, err
+ return 0, err
}
if d.enabled == nil {
d.enabled = new(bool)
@@ -207,3 +222,198 @@ func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset
*d.enabled = v != 0
return n, d.stack.SetTCPSACKEnabled(*d.enabled)
}
+
+// tcpRecoveryData implements vfs.WritableDynamicBytesSource for
+// /proc/sys/net/ipv4/tcp_recovery.
+//
+// +stateify savable
+type tcpRecoveryData struct {
+ kernfs.DynamicBytesFile
+
+ stack inet.Stack `state:"wait"`
+}
+
+var _ vfs.WritableDynamicBytesSource = (*tcpRecoveryData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ recovery, err := d.stack.TCPRecovery()
+ if err != nil {
+ return err
+ }
+
+ _, err = buf.WriteString(fmt.Sprintf("%d\n", recovery))
+ return err
+}
+
+// Write implements vfs.WritableDynamicBytesSource.Write.
+func (d *tcpRecoveryData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+ if offset != 0 {
+ // No need to handle partial writes thus far.
+ return 0, syserror.EINVAL
+ }
+ if src.NumBytes() == 0 {
+ return 0, nil
+ }
+
+ // Limit the amount of memory allocated.
+ src = src.TakeFirst(usermem.PageSize - 1)
+
+ var v int32
+ n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+ if err != nil {
+ return 0, err
+ }
+ if err := d.stack.SetTCPRecovery(inet.TCPLossRecovery(v)); err != nil {
+ return 0, err
+ }
+ return n, nil
+}
+
+// tcpMemData implements vfs.WritableDynamicBytesSource for
+// /proc/sys/net/ipv4/tcp_rmem and /proc/sys/net/ipv4/tcp_wmem.
+//
+// +stateify savable
+type tcpMemData struct {
+ kernfs.DynamicBytesFile
+
+ dir tcpMemDir
+ stack inet.Stack `state:"wait"`
+
+ // mu protects against concurrent reads/writes to FDs based on the dentry
+ // backing this byte source.
+ mu sync.Mutex `state:"nosave"`
+}
+
+var _ vfs.WritableDynamicBytesSource = (*tcpMemData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *tcpMemData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ d.mu.Lock()
+ defer d.mu.Unlock()
+
+ size, err := d.readSizeLocked()
+ if err != nil {
+ return err
+ }
+ _, err = buf.WriteString(fmt.Sprintf("%d\t%d\t%d\n", size.Min, size.Default, size.Max))
+ return err
+}
+
+// Write implements vfs.WritableDynamicBytesSource.Write.
+func (d *tcpMemData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+ if offset != 0 {
+ // No need to handle partial writes thus far.
+ return 0, syserror.EINVAL
+ }
+ if src.NumBytes() == 0 {
+ return 0, nil
+ }
+ d.mu.Lock()
+ defer d.mu.Unlock()
+
+ // Limit the amount of memory allocated.
+ src = src.TakeFirst(usermem.PageSize - 1)
+ size, err := d.readSizeLocked()
+ if err != nil {
+ return 0, err
+ }
+ buf := []int32{int32(size.Min), int32(size.Default), int32(size.Max)}
+ n, err := usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, buf, src.Opts)
+ if err != nil {
+ return 0, err
+ }
+ newSize := inet.TCPBufferSize{
+ Min: int(buf[0]),
+ Default: int(buf[1]),
+ Max: int(buf[2]),
+ }
+ if err := d.writeSizeLocked(newSize); err != nil {
+ return 0, err
+ }
+ return n, nil
+}
+
+// Precondition: d.mu must be locked.
+func (d *tcpMemData) readSizeLocked() (inet.TCPBufferSize, error) {
+ switch d.dir {
+ case tcpRMem:
+ return d.stack.TCPReceiveBufferSize()
+ case tcpWMem:
+ return d.stack.TCPSendBufferSize()
+ default:
+ panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir))
+ }
+}
+
+// Precondition: d.mu must be locked.
+func (d *tcpMemData) writeSizeLocked(size inet.TCPBufferSize) error {
+ switch d.dir {
+ case tcpRMem:
+ return d.stack.SetTCPReceiveBufferSize(size)
+ case tcpWMem:
+ return d.stack.SetTCPSendBufferSize(size)
+ default:
+ panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir))
+ }
+}
+
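tcp_rmem and tcp_wmem use Linux's three-integer "min default max" layout: Generate above emits it tab-separated and Write parses it back with CopyInt32StringsInVec. A self-contained sketch of that round trip, using plain strconv in place of gvisor's usermem helpers (an assumption of this sketch):

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseTCPMem parses the "min\tdefault\tmax" triple used by
// /proc/sys/net/ipv4/tcp_rmem and tcp_wmem.
func parseTCPMem(s string) (min, def, max int, err error) {
	fields := strings.Fields(s)
	if len(fields) != 3 {
		return 0, 0, 0, fmt.Errorf("want 3 fields, got %d", len(fields))
	}
	var vals [3]int
	for i, f := range fields {
		if vals[i], err = strconv.Atoi(f); err != nil {
			return 0, 0, 0, err
		}
	}
	return vals[0], vals[1], vals[2], nil
}

func main() {
	// Same layout tcpMemData.Generate emits (example values).
	line := fmt.Sprintf("%d\t%d\t%d\n", 4096, 87380, 6291456)
	min, def, max, err := parseTCPMem(line)
	fmt.Println(min, def, max, err) // 4096 87380 6291456 <nil>
}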
+// ipForwarding implements vfs.WritableDynamicBytesSource for
+// /proc/sys/net/ipv4/ip_forward.
+//
+// +stateify savable
+type ipForwarding struct {
+ kernfs.DynamicBytesFile
+
+ stack inet.Stack `state:"wait"`
+ enabled *bool
+}
+
+var _ vfs.WritableDynamicBytesSource = (*ipForwarding)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (ipf *ipForwarding) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ if ipf.enabled == nil {
+ enabled := ipf.stack.Forwarding(ipv4.ProtocolNumber)
+ ipf.enabled = &enabled
+ }
+
+ val := "0\n"
+ if *ipf.enabled {
+ // Technically, this is not quite compatible with Linux. Linux stores these
+ // as an integer, so if you write "2" into ip_forward, you should get 2 back.
+ // Tough luck.
+ val = "1\n"
+ }
+ buf.WriteString(val)
+
+ return nil
+}
+
+// Write implements vfs.WritableDynamicBytesSource.Write.
+func (ipf *ipForwarding) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+ if offset != 0 {
+ // No need to handle partial writes thus far.
+ return 0, syserror.EINVAL
+ }
+ if src.NumBytes() == 0 {
+ return 0, nil
+ }
+
+ // Limit the input size so a large write does not impact performance.
+ src = src.TakeFirst(usermem.PageSize - 1)
+
+ var v int32
+ n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+ if err != nil {
+ return 0, err
+ }
+ if ipf.enabled == nil {
+ ipf.enabled = new(bool)
+ }
+ *ipf.enabled = v != 0
+ if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, *ipf.enabled); err != nil {
+ return 0, err
+ }
+ return n, nil
+}
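As the tests below exercise with "2404", any value that parses as a nonzero int32 enables forwarding. From userspace the whole path reduces to writing the sysctl file; a hedged sketch against a mounted procfs (requires privileges, and inside gvisor the write lands in ipForwarding.Write above):

package main

import (
	"fmt"
	"os"
)

// setIPForwarding toggles /proc/sys/net/ipv4/ip_forward. "0" disables;
// any nonzero integer string enables.
func setIPForwarding(enable bool) error {
	v := "0"
	if enable {
		v = "1"
	}
	return os.WriteFile("/proc/sys/net/ipv4/ip_forward", []byte(v), 0644)
}

func main() {
	if err := setIPForwarding(true); err != nil {
		fmt.Fprintln(os.Stderr, err) // typically EACCES without privileges
	}
}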
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys_test.go b/pkg/sentry/fsimpl/proc/tasks_sys_test.go
index be54897bb..6cee22823 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys_test.go
@@ -20,8 +20,10 @@ import (
"testing"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/contexttest"
"gvisor.dev/gvisor/pkg/sentry/inet"
+ "gvisor.dev/gvisor/pkg/usermem"
)
func newIPv6TestStack() *inet.TestStack {
@@ -76,3 +78,72 @@ func TestIfinet6(t *testing.T) {
t.Errorf("Got n.contents() = %v, want = %v", got, want)
}
}
+
+// TestConfigureIPForwarding tests the implementation of
+// /proc/sys/net/ipv4/ip_forward.
+func TestConfigureIPForwarding(t *testing.T) {
+ ctx := context.Background()
+ s := inet.NewTestStack()
+
+ var cases = []struct {
+ comment string
+ initial bool
+ str string
+ final bool
+ }{
+ {
+ comment: `Forwarding is disabled; write 1 and enable forwarding`,
+ initial: false,
+ str: "1",
+ final: true,
+ },
+ {
+ comment: `Forwarding is disabled; write 0 and disable forwarding`,
+ initial: false,
+ str: "0",
+ final: false,
+ },
+ {
+ comment: `Forwarding is enabled; write 1 and enable forwarding`,
+ initial: true,
+ str: "1",
+ final: true,
+ },
+ {
+ comment: `Forwarding is enabled; write 0 and disable forwarding`,
+ initial: true,
+ str: "0",
+ final: false,
+ },
+ {
+ comment: `Forwarding is disabled; write 2404 and enable forwarding`,
+ initial: false,
+ str: "2404",
+ final: true,
+ },
+ {
+ comment: `Forwarding is enabled; write 2404 and enable forwarding`,
+ initial: true,
+ str: "2404",
+ final: true,
+ },
+ }
+ for _, c := range cases {
+ t.Run(c.comment, func(t *testing.T) {
+ s.IPForwarding = c.initial
+
+ file := &ipForwarding{stack: s, enabled: &c.initial}
+
+ // Write the values.
+ src := usermem.BytesIOSequence([]byte(c.str))
+ if n, err := file.Write(ctx, src, 0); n != int64(len(c.str)) || err != nil {
+ t.Errorf("file.Write(ctx, nil, %q, 0) = (%d, %v); want (%d, nil)", c.str, n, err, len(c.str))
+ }
+
+ // Read the values from the stack and check them.
+ if got, want := s.IPForwarding, c.final; got != want {
+ t.Errorf("s.IPForwarding incorrect; got: %v, want: %v", got, want)
+ }
+ })
+ }
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 19abb5034..7ee6227a9 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -67,6 +67,7 @@ var (
taskStaticFiles = map[string]testutil.DirentType{
"auxv": linux.DT_REG,
"cgroup": linux.DT_REG,
+ "cwd": linux.DT_LNK,
"cmdline": linux.DT_REG,
"comm": linux.DT_REG,
"environ": linux.DT_REG,
@@ -76,6 +77,7 @@ var (
"gid_map": linux.DT_REG,
"io": linux.DT_REG,
"maps": linux.DT_REG,
+ "mem": linux.DT_REG,
"mountinfo": linux.DT_REG,
"mounts": linux.DT_REG,
"net": linux.DT_DIR,
@@ -104,13 +106,16 @@ func setup(t *testing.T) *testutil.System {
AllowUserMount: true,
})
- mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.GetFilesystemOptions{})
+ mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.MountOptions{})
if err != nil {
t.Fatalf("NewMountNamespace(): %v", err)
}
+ root := mntns.Root()
+ root.IncRef()
+ defer root.DecRef(ctx)
pop := &vfs.PathOperation{
- Root: mntns.Root(),
- Start: mntns.Root(),
+ Root: root,
+ Start: root,
Path: fspath.Parse("/proc"),
}
if err := k.VFS().MkdirAt(ctx, creds, pop, &vfs.MkdirOptions{Mode: 0777}); err != nil {
@@ -118,8 +123,8 @@ func setup(t *testing.T) *testutil.System {
}
pop = &vfs.PathOperation{
- Root: mntns.Root(),
- Start: mntns.Root(),
+ Root: root,
+ Start: root,
Path: fspath.Parse("/proc"),
}
mntOpts := &vfs.MountOptions{
@@ -132,7 +137,7 @@ func setup(t *testing.T) *testutil.System {
},
},
}
- if err := k.VFS().MountAt(ctx, creds, "", pop, Name, mntOpts); err != nil {
+ if _, err := k.VFS().MountAt(ctx, creds, "", pop, Name, mntOpts); err != nil {
t.Fatalf("MountAt(/proc): %v", err)
}
return testutil.NewSystem(ctx, t, k.VFS(), mntns)
@@ -218,7 +223,7 @@ func TestTasks(t *testing.T) {
if err != nil {
t.Fatalf("vfsfs.OpenAt(%q) failed: %v", path, err)
}
- defer fd.DecRef()
+ defer fd.DecRef(s.Ctx)
buf := make([]byte, 1)
bufIOSeq := usermem.BytesIOSequence(buf)
if _, err := fd.Read(s.Ctx, bufIOSeq, vfs.ReadOptions{}); err != syserror.EISDIR {
@@ -336,7 +341,7 @@ func TestTasksOffset(t *testing.T) {
if err != nil {
t.Fatalf("vfsfs.OpenAt(/) failed: %v", err)
}
- defer fd.DecRef()
+ defer fd.DecRef(s.Ctx)
if _, err := fd.Seek(s.Ctx, tc.offset, linux.SEEK_SET); err != nil {
t.Fatalf("Seek(%d, SEEK_SET): %v", tc.offset, err)
}
@@ -441,7 +446,7 @@ func iterateDir(ctx context.Context, t *testing.T, s *testutil.System, fd *vfs.F
t.Errorf("vfsfs.OpenAt(%v) failed: %v", absPath, err)
continue
}
- defer child.DecRef()
+ defer child.DecRef(ctx)
stat, err := child.Stat(ctx, vfs.StatOptions{})
if err != nil {
t.Errorf("Stat(%v) failed: %v", absPath, err)
@@ -476,7 +481,7 @@ func TestTree(t *testing.T) {
if err != nil {
t.Fatalf("failed to create test file: %v", err)
}
- defer file.DecRef()
+ defer file.DecRef(s.Ctx)
var tasks []*kernel.Task
for i := 0; i < 5; i++ {
@@ -501,5 +506,5 @@ func TestTree(t *testing.T) {
t.Fatalf("vfsfs.OpenAt(/proc) failed: %v", err)
}
iterateDir(ctx, t, s, fd)
- fd.DecRef()
+ fd.DecRef(ctx)
}
diff --git a/pkg/sentry/fsimpl/signalfd/BUILD b/pkg/sentry/fsimpl/signalfd/BUILD
index 067c1657f..adb610213 100644
--- a/pkg/sentry/fsimpl/signalfd/BUILD
+++ b/pkg/sentry/fsimpl/signalfd/BUILD
@@ -8,7 +8,6 @@ go_library(
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
- "//pkg/binary",
"//pkg/context",
"//pkg/sentry/kernel",
"//pkg/sentry/vfs",
diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go
index d29ef3f83..10f1452ef 100644
--- a/pkg/sentry/fsimpl/signalfd/signalfd.go
+++ b/pkg/sentry/fsimpl/signalfd/signalfd.go
@@ -16,7 +16,6 @@ package signalfd
import (
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -26,11 +25,14 @@ import (
"gvisor.dev/gvisor/pkg/waiter"
)
-// SignalFileDescription implements FileDescriptionImpl for signal fds.
+// SignalFileDescription implements vfs.FileDescriptionImpl for signal fds.
+//
+// +stateify savable
type SignalFileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
+ vfs.NoLockFD
// target is the original signal target task.
//
@@ -42,7 +44,7 @@ type SignalFileDescription struct {
target *kernel.Task
// mu protects mask.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
// mask is the signal mask. Protected by mu.
mask linux.SignalSet
@@ -53,7 +55,7 @@ var _ vfs.FileDescriptionImpl = (*SignalFileDescription)(nil)
// New creates a new signal fd.
func New(vfsObj *vfs.VirtualFilesystem, target *kernel.Task, mask linux.SignalSet, flags uint32) (*vfs.FileDescription, error) {
vd := vfsObj.NewAnonVirtualDentry("[signalfd]")
- defer vd.DecRef()
+ defer vd.DecRef(target)
sfd := &SignalFileDescription{
target: target,
mask: mask,
@@ -82,7 +84,7 @@ func (sfd *SignalFileDescription) SetMask(mask linux.SignalSet) {
sfd.mask = mask
}
-// Read implements FileDescriptionImpl.Read.
+// Read implements vfs.FileDescriptionImpl.Read.
func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
// Attempt to dequeue relevant signals.
info, err := sfd.target.Sigtimedwait(sfd.Mask(), 0)
@@ -92,8 +94,7 @@ func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequen
}
// Copy out the signal info using the specified format.
- var buf [128]byte
- binary.Marshal(buf[:0], usermem.ByteOrder, &linux.SignalfdSiginfo{
+ infoNative := linux.SignalfdSiginfo{
Signo: uint32(info.Signo),
Errno: info.Errno,
Code: info.Code,
@@ -102,9 +103,13 @@ func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequen
Status: info.Status(),
Overrun: uint32(info.Overrun()),
Addr: info.Addr(),
- })
- n, err := dst.CopyOut(ctx, buf[:])
- return int64(n), err
+ }
+ n, err := infoNative.WriteTo(dst.Writer(ctx))
+ if err == usermem.ErrEndOfIOSequence {
+ // Partial copy-out ok.
+ err = nil
+ }
+ return n, err
}
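Userspace consumes a signalfd as fixed-size 128-byte signalfd_siginfo records, and the change above lets a short destination receive whatever prefix fits rather than failing outright. A decoding sketch for the leading fields, per the Linux ABI layout (only ssi_signo/ssi_errno/ssi_code shown):

package main

import (
	"encoding/binary"
	"fmt"
)

const siginfoSize = 128 // sizeof(struct signalfd_siginfo)

// parseSiginfo decodes the first three fields of one record:
// ssi_signo (u32), ssi_errno (s32), ssi_code (s32).
func parseSiginfo(rec []byte) (signo uint32, errno, code int32, err error) {
	if len(rec) < siginfoSize {
		return 0, 0, 0, fmt.Errorf("short record: %d bytes", len(rec))
	}
	signo = binary.LittleEndian.Uint32(rec[0:4])
	errno = int32(binary.LittleEndian.Uint32(rec[4:8]))
	code = int32(binary.LittleEndian.Uint32(rec[8:12]))
	return signo, errno, code, nil
}

func main() {
	rec := make([]byte, siginfoSize)
	binary.LittleEndian.PutUint32(rec[0:4], 17) // SIGCHLD on most arches
	signo, errno, code, err := parseSiginfo(rec)
	fmt.Println(signo, errno, code, err) // 17 0 0 <nil>
}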
// Readiness implements waiter.Waitable.Readiness.
@@ -131,5 +136,5 @@ func (sfd *SignalFileDescription) EventUnregister(entry *waiter.Entry) {
sfd.target.SignalUnregister(entry)
}
-// Release implements FileDescriptionImpl.Release()
-func (sfd *SignalFileDescription) Release() {}
+// Release implements vfs.FileDescriptionImpl.Release.
+func (sfd *SignalFileDescription) Release(context.Context) {}
diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go
index ee0828a15..fda1fa942 100644
--- a/pkg/sentry/fsimpl/sockfs/sockfs.go
+++ b/pkg/sentry/fsimpl/sockfs/sockfs.go
@@ -28,14 +28,16 @@ import (
)
// filesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type filesystemType struct{}
-// GetFilesystem implements FilesystemType.GetFilesystem.
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
panic("sockfs.filesystemType.GetFilesystem should never be called")
}
-// Name implements FilesystemType.Name.
+// Name implements vfs.FilesystemType.Name.
//
// Note that sockfs does not strictly need to be registered; if it is not,
// it simply will not show up under /proc/filesystems. This is a very minor
@@ -44,6 +46,10 @@ func (filesystemType) Name() string {
return "sockfs"
}
+// Release implements vfs.FilesystemType.Release.
+func (filesystemType) Release(ctx context.Context) {}
+
+// +stateify savable
type filesystem struct {
kernfs.Filesystem
@@ -67,9 +73,9 @@ func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) {
}
// Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {
+func (fs *filesystem) Release(ctx context.Context) {
fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
- fs.Filesystem.Release()
+ fs.Filesystem.Release(ctx)
}
// PrependPath implements vfs.FilesystemImpl.PrependPath.
@@ -80,30 +86,37 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
}
// inode implements kernfs.Inode.
+//
+// +stateify savable
type inode struct {
- kernfs.InodeNotDirectory
- kernfs.InodeNotSymlink
kernfs.InodeAttrs
kernfs.InodeNoopRefCount
+ kernfs.InodeNotDirectory
+ kernfs.InodeNotSymlink
}
// Open implements kernfs.Inode.Open.
-func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
return nil, syserror.ENXIO
}
+// StatFS implements kernfs.Inode.StatFS.
+func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
+ return vfs.GenericStatFS(linux.SOCKFS_MAGIC), nil
+}
+
// NewDentry constructs and returns a sockfs dentry.
//
// Preconditions: mnt.Filesystem() must have been returned by NewFilesystem().
-func NewDentry(creds *auth.Credentials, mnt *vfs.Mount) *vfs.Dentry {
+func NewDentry(ctx context.Context, mnt *vfs.Mount) *vfs.Dentry {
fs := mnt.Filesystem().Impl().(*filesystem)
// File mode matches net/socket.c:sock_alloc.
filemode := linux.FileMode(linux.S_IFSOCK | 0600)
i := &inode{}
- i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.Filesystem.NextIno(), filemode)
+ i.InodeAttrs.Init(ctx, auth.CredentialsFromContext(ctx), linux.UNNAMED_MAJOR, fs.devMinor, fs.Filesystem.NextIno(), filemode)
d := &kernfs.Dentry{}
- d.Init(i)
+ d.Init(&fs.Filesystem, i)
return d.VFSDentry()
}
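NewDentry now takes a context instead of explicit credentials and recovers them with auth.CredentialsFromContext, the idiom used throughout the sentry. A self-contained sketch of that idiom (the key type and fallback below are stand-ins, not gVisor's internals):

package main

import (
	"context"
	"fmt"
)

type credentials struct{ UID, GID uint32 }

type credsKey struct{}

// credentialsFromContext plays the role of auth.CredentialsFromContext:
// callers thread credentials through ctx rather than through every
// function signature.
func credentialsFromContext(ctx context.Context) *credentials {
	if c, ok := ctx.Value(credsKey{}).(*credentials); ok {
		return c
	}
	// Fallback; a real implementation would return a safe anonymous default.
	return &credentials{}
}

func main() {
	ctx := context.WithValue(context.Background(), credsKey{},
		&credentials{UID: 1000, GID: 1000})
	fmt.Printf("%+v\n", *credentialsFromContext(ctx)) // {UID:1000 GID:1000}
}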
diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD
index a741e2bb6..09043b572 100644
--- a/pkg/sentry/fsimpl/sys/BUILD
+++ b/pkg/sentry/fsimpl/sys/BUILD
@@ -1,21 +1,42 @@
load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
licenses(["notice"])
+go_template_instance(
+ name = "dir_refs",
+ out = "dir_refs.go",
+ package = "sys",
+ prefix = "dir",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "dir",
+ },
+)
+
go_library(
name = "sys",
srcs = [
+ "dir_refs.go",
+ "kcov.go",
"sys.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
"//pkg/context",
+ "//pkg/coverage",
+ "//pkg/log",
+ "//pkg/refs",
+ "//pkg/refsvfs2",
+ "//pkg/sentry/arch",
"//pkg/sentry/fsimpl/kernfs",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
+ "//pkg/sentry/memmap",
"//pkg/sentry/vfs",
"//pkg/syserror",
+ "//pkg/usermem",
],
)
@@ -29,6 +50,6 @@ go_test(
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/vfs",
- "@com_github_google_go-cmp//cmp:go_default_library",
+ "@com_github_google_go_cmp//cmp:go_default_library",
],
)
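The go_template_instance rule above stamps out a reference-count type, dirRefs, for the sys dir inode from the shared refs_template. A simplified sketch of what such a generated type provides, assuming only sync/atomic (the real template additionally hooks into refsvfs2 leak checking):

package main

import (
	"fmt"
	"sync/atomic"
)

// dirRefs is a sketch of the generated refcount: the zero value holds one
// reference, and a destructor runs when the count drops to zero.
type dirRefs struct {
	refCount int64 // offset by -1 so the zero value means one reference
}

func (r *dirRefs) IncRef() {
	if atomic.AddInt64(&r.refCount, 1) <= 0 {
		panic("IncRef on released object")
	}
}

// DecRef invokes destroy once the last reference is dropped, matching how
// dir.DecRef forwards to d.Destroy(ctx) in sys.go.
func (r *dirRefs) DecRef(destroy func()) {
	switch v := atomic.AddInt64(&r.refCount, -1); {
	case v < -1:
		panic("DecRef on released object")
	case v == -1 && destroy != nil:
		destroy()
	}
}

func main() {
	var r dirRefs
	r.IncRef()
	r.DecRef(nil)
	r.DecRef(func() { fmt.Println("destroyed") })
}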
diff --git a/pkg/sentry/fsimpl/sys/kcov.go b/pkg/sentry/fsimpl/sys/kcov.go
new file mode 100644
index 000000000..b13f141a8
--- /dev/null
+++ b/pkg/sentry/fsimpl/sys/kcov.go
@@ -0,0 +1,118 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sys
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+func (fs *filesystem) newKcovFile(ctx context.Context, creds *auth.Credentials) kernfs.Inode {
+ k := &kcovInode{}
+ k.InodeAttrs.Init(ctx, creds, 0, 0, fs.NextIno(), linux.S_IFREG|0600)
+ return k
+}
+
+// kcovInode implements kernfs.Inode.
+//
+// +stateify savable
+type kcovInode struct {
+ kernfs.InodeAttrs
+ kernfs.InodeNoopRefCount
+ kernfs.InodeNotDirectory
+ kernfs.InodeNotSymlink
+ implStatFS
+}
+
+func (i *kcovInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ k := kernel.KernelFromContext(ctx)
+ if k == nil {
+ panic("KernelFromContext returned nil")
+ }
+ fd := &kcovFD{
+ inode: i,
+ kcov: k.NewKcov(),
+ }
+
+ if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{
+ DenyPRead: true,
+ DenyPWrite: true,
+ }); err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
+
+// +stateify savable
+type kcovFD struct {
+ vfs.FileDescriptionDefaultImpl
+ vfs.NoLockFD
+
+ vfsfd vfs.FileDescription
+ inode *kcovInode
+ kcov *kernel.Kcov
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (fd *kcovFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ cmd := uint32(args[1].Int())
+ arg := args[2].Uint64()
+ switch cmd { // cmd is already a uint32; no conversion needed
+ case linux.KCOV_INIT_TRACE:
+ return 0, fd.kcov.InitTrace(arg)
+ case linux.KCOV_ENABLE:
+ return 0, fd.kcov.EnableTrace(ctx, uint8(arg))
+ case linux.KCOV_DISABLE:
+ if arg != 0 {
+ // This arg is unused; it should be 0.
+ return 0, syserror.EINVAL
+ }
+ return 0, fd.kcov.DisableTrace(ctx)
+ default:
+ return 0, syserror.ENOTTY
+ }
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *kcovFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+ return fd.kcov.ConfigureMMap(ctx, opts)
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *kcovFD) Release(ctx context.Context) {
+ // kcov instances are reference-counted in Linux, but simply clearing the
+ // trace on release seems sufficient for our purposes.
+ fd.kcov.Clear(ctx)
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *kcovFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+ creds := auth.CredentialsFromContext(ctx)
+ fs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
+ return fd.inode.SetStat(ctx, fs, creds, opts)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *kcovFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+ return fd.inode.Stat(ctx, fd.vfsfd.Mount().Filesystem(), opts)
+}
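kcovFD implements the standard Linux kcov handshake: KCOV_INIT_TRACE sizes the coverage buffer, mmap maps it, KCOV_ENABLE starts tracing, and word 0 of the mapping holds the number of recorded PCs. A hedged guest-side sketch of that protocol; the ioctl request values below are the usual Linux encodings (_IOR('c', 1, unsigned long) and _IO('c', 100)/_IO('c', 101)), stated here as assumptions rather than taken from this patch:

//go:build linux

package main

import (
	"encoding/binary"
	"fmt"
	"os"
	"syscall"
)

// Assumed Linux kcov ioctl numbers.
const (
	kcovInitTrace = 0x80086301 // _IOR('c', 1, unsigned long)
	kcovEnable    = 0x6364     // _IO('c', 100)
	kcovDisable   = 0x6365     // _IO('c', 101)
	kcovTracePC   = 0          // KCOV_TRACE_PC: record program counters
	coverWords    = 1 << 16    // buffer size in 8-byte words
)

func ioctl(fd, cmd, arg uintptr) error {
	if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, fd, cmd, arg); errno != 0 {
		return errno
	}
	return nil
}

func main() {
	f, err := os.OpenFile("/sys/kernel/debug/kcov", os.O_RDWR, 0)
	if err != nil {
		panic(err)
	}
	defer f.Close()
	if err := ioctl(f.Fd(), kcovInitTrace, coverWords); err != nil {
		panic(err)
	}
	cover, err := syscall.Mmap(int(f.Fd()), 0, coverWords*8,
		syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED)
	if err != nil {
		panic(err)
	}
	defer syscall.Munmap(cover)
	if err := ioctl(f.Fd(), kcovEnable, kcovTracePC); err != nil {
		panic(err)
	}
	// ... run the code under test; the kernel appends PCs to the buffer.
	fmt.Println("covered PCs:", binary.LittleEndian.Uint64(cover[:8]))
	_ = ioctl(f.Fd(), kcovDisable, 0)
}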
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index 0af373604..7d2147141 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -18,9 +18,11 @@ package sys
import (
"bytes"
"fmt"
+ "strconv"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/coverage"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -28,13 +30,21 @@ import (
"gvisor.dev/gvisor/pkg/syserror"
)
-// Name is the default filesystem name.
-const Name = "sysfs"
+const (
+ // Name is the default filesystem name.
+ Name = "sysfs"
+ defaultSysDirMode = linux.FileMode(0755)
+ defaultMaxCachedDentries = uint64(1000)
+)
// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type FilesystemType struct{}
// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
type filesystem struct {
kernfs.Filesystem
@@ -46,6 +56,9 @@ func (FilesystemType) Name() string {
return Name
}
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
devMinor, err := vfsObj.GetAnonBlockDevMinor()
@@ -53,97 +66,157 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
return nil, nil, err
}
+ mopts := vfs.GenericParseMountOptions(opts.Data)
+ maxCachedDentries := defaultMaxCachedDentries
+ if str, ok := mopts["dentry_cache_limit"]; ok {
+ delete(mopts, "dentry_cache_limit")
+ maxCachedDentries, err = strconv.ParseUint(str, 10, 64)
+ if err != nil {
+ ctx.Warningf("sys.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str)
+ return nil, nil, syserror.EINVAL
+ }
+ }
+
fs := &filesystem{
devMinor: devMinor,
}
+ fs.MaxCachedDentries = maxCachedDentries
fs.VFSFilesystem().Init(vfsObj, &fsType, fs)
- k := kernel.KernelFromContext(ctx)
- maxCPUCores := k.ApplicationCores()
- defaultSysDirMode := linux.FileMode(0755)
- root := fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
- "block": fs.newDir(creds, defaultSysDirMode, nil),
- "bus": fs.newDir(creds, defaultSysDirMode, nil),
- "class": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
- "power_supply": fs.newDir(creds, defaultSysDirMode, nil),
+ root := fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
+ "block": fs.newDir(ctx, creds, defaultSysDirMode, nil),
+ "bus": fs.newDir(ctx, creds, defaultSysDirMode, nil),
+ "class": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
+ "power_supply": fs.newDir(ctx, creds, defaultSysDirMode, nil),
}),
- "dev": fs.newDir(creds, defaultSysDirMode, nil),
- "devices": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
- "system": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
- "cpu": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
- "online": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)),
- "possible": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)),
- "present": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)),
- }),
+ "dev": fs.newDir(ctx, creds, defaultSysDirMode, nil),
+ "devices": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
+ "system": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
+ "cpu": cpuDir(ctx, fs, creds),
}),
}),
- "firmware": fs.newDir(creds, defaultSysDirMode, nil),
- "fs": fs.newDir(creds, defaultSysDirMode, nil),
- "kernel": fs.newDir(creds, defaultSysDirMode, nil),
- "module": fs.newDir(creds, defaultSysDirMode, nil),
- "power": fs.newDir(creds, defaultSysDirMode, nil),
+ "firmware": fs.newDir(ctx, creds, defaultSysDirMode, nil),
+ "fs": fs.newDir(ctx, creds, defaultSysDirMode, nil),
+ "kernel": kernelDir(ctx, fs, creds),
+ "module": fs.newDir(ctx, creds, defaultSysDirMode, nil),
+ "power": fs.newDir(ctx, creds, defaultSysDirMode, nil),
})
- return fs.VFSFilesystem(), root.VFSDentry(), nil
+ var rootD kernfs.Dentry
+ rootD.Init(&fs.Filesystem, root)
+ return fs.VFSFilesystem(), rootD.VFSDentry(), nil
+}
+
+func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) kernfs.Inode {
+ k := kernel.KernelFromContext(ctx)
+ maxCPUCores := k.ApplicationCores()
+ children := map[string]kernfs.Inode{
+ "online": fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)),
+ "possible": fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)),
+ "present": fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)),
+ }
+ for i := uint(0); i < maxCPUCores; i++ {
+ children[fmt.Sprintf("cpu%d", i)] = fs.newDir(ctx, creds, linux.FileMode(0555), nil)
+ }
+ return fs.newDir(ctx, creds, defaultSysDirMode, children)
+}
+
+func kernelDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) kernfs.Inode {
+ // If kcov is available, set up /sys/kernel/debug/kcov. Technically, debugfs
+ // should be mounted at debug/, but for our purposes, it is sufficient to
+ // keep it in sys.
+ var children map[string]kernfs.Inode
+ if coverage.KcovAvailable() {
+ children = map[string]kernfs.Inode{
+ "debug": fs.newDir(ctx, creds, linux.FileMode(0700), map[string]kernfs.Inode{
+ "kcov": fs.newKcovFile(ctx, creds),
+ }),
+ }
+ }
+ return fs.newDir(ctx, creds, defaultSysDirMode, children)
}
// Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {
+func (fs *filesystem) Release(ctx context.Context) {
fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
- fs.Filesystem.Release()
+ fs.Filesystem.Release(ctx)
}
// dir implements kernfs.Inode.
+//
+// +stateify savable
type dir struct {
+ dirRefs
+ kernfs.InodeAlwaysValid
kernfs.InodeAttrs
- kernfs.InodeNoDynamicLookup
kernfs.InodeNotSymlink
kernfs.InodeDirectoryNoNewChildren
-
+ kernfs.InodeTemporary
kernfs.OrderedChildren
- dentry kernfs.Dentry
+
+ locks vfs.FileLocks
}
-func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+func (fs *filesystem) newDir(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode {
d := &dir{}
- d.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
+ d.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
- d.dentry.Init(d)
-
- d.IncLinks(d.OrderedChildren.Populate(&d.dentry, contents))
-
- return &d.dentry
+ d.EnableLeakCheck()
+ d.IncLinks(d.OrderedChildren.Populate(contents))
+ return d
}
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat, disallowing changes to inode attributes.
func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
return syserror.EPERM
}
// Open implements kernfs.Inode.Open.
-func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
+ SeekEnd: kernfs.SeekEndStaticEntries,
+ })
if err != nil {
return nil, err
}
return fd.VFSFileDescription(), nil
}
+// DecRef implements kernfs.Inode.DecRef.
+func (d *dir) DecRef(ctx context.Context) {
+ d.dirRefs.DecRef(func() { d.Destroy(ctx) })
+}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (d *dir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
+ return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil
+}
+
// cpuFile implements kernfs.Inode.
+//
+// +stateify savable
type cpuFile struct {
+ implStatFS
kernfs.DynamicBytesFile
+
maxCores uint
}
// Generate implements vfs.DynamicBytesSource.Generate.
func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error {
- fmt.Fprintf(buf, "0-%d", c.maxCores-1)
+ fmt.Fprintf(buf, "0-%d\n", c.maxCores-1)
return nil
}
-func (fs *filesystem) newCPUFile(creds *auth.Credentials, maxCores uint, mode linux.FileMode) *kernfs.Dentry {
+func (fs *filesystem) newCPUFile(ctx context.Context, creds *auth.Credentials, maxCores uint, mode linux.FileMode) kernfs.Inode {
c := &cpuFile{maxCores: maxCores}
- c.DynamicBytesFile.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), c, mode)
- d := &kernfs.Dentry{}
- d.Init(c)
- return d
+ c.DynamicBytesFile.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), c, mode)
+ return c
+}
+
+// +stateify savable
+type implStatFS struct{}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
+ return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil
}
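GetFilesystem now honors a dentry_cache_limit mount option parsed out of the option string by vfs.GenericParseMountOptions. A standalone sketch of that parsing contract; the helper below is a plausible equivalent of the gVisor function, not a copy of it:

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseMountOptions mimics vfs.GenericParseMountOptions: the comma-separated
// data string "a=1,b" becomes {"a": "1", "b": ""}.
func parseMountOptions(data string) map[string]string {
	opts := make(map[string]string)
	for _, kv := range strings.Split(data, ",") {
		if kv == "" {
			continue
		}
		k, v, _ := strings.Cut(kv, "=")
		opts[k] = v
	}
	return opts
}

func main() {
	mopts := parseMountOptions("dentry_cache_limit=500,ro")
	maxCachedDentries := uint64(1000) // default, as in sysfs above
	if str, ok := mopts["dentry_cache_limit"]; ok {
		delete(mopts, "dentry_cache_limit")
		n, err := strconv.ParseUint(str, 10, 64)
		if err != nil {
			panic("invalid dentry_cache_limit") // sysfs returns EINVAL here
		}
		maxCachedDentries = n
	}
	fmt.Println(maxCachedDentries, mopts) // 500 map[ro:]
}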
diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go
index 4b3602d47..0a0d914cc 100644
--- a/pkg/sentry/fsimpl/sys/sys_test.go
+++ b/pkg/sentry/fsimpl/sys/sys_test.go
@@ -38,7 +38,7 @@ func newTestSystem(t *testing.T) *testutil.System {
AllowUserMount: true,
})
- mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{})
+ mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.MountOptions{})
if err != nil {
t.Fatalf("Failed to create new mount namespace: %v", err)
}
@@ -51,7 +51,7 @@ func TestReadCPUFile(t *testing.T) {
k := kernel.KernelFromContext(s.Ctx)
maxCPUCores := k.ApplicationCores()
- expected := fmt.Sprintf("0-%d", maxCPUCores-1)
+ expected := fmt.Sprintf("0-%d\n", maxCPUCores-1)
for _, fname := range []string{"online", "possible", "present"} {
pop := s.PathOpAtRoot(fmt.Sprintf("devices/system/cpu/%s", fname))
@@ -59,7 +59,7 @@ func TestReadCPUFile(t *testing.T) {
if err != nil {
t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err)
}
- defer fd.DecRef()
+ defer fd.DecRef(s.Ctx)
content, err := s.ReadToEnd(fd)
if err != nil {
t.Fatalf("Read failed: %v", err)
diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD
index 0e4053a46..400a97996 100644
--- a/pkg/sentry/fsimpl/testutil/BUILD
+++ b/pkg/sentry/fsimpl/testutil/BUILD
@@ -32,6 +32,6 @@ go_library(
"//pkg/sentry/vfs",
"//pkg/sync",
"//pkg/usermem",
- "@com_github_google_go-cmp//cmp:go_default_library",
+ "@com_github_google_go_cmp//cmp:go_default_library",
],
)
diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go
index c16a36cdb..738c0c9cc 100644
--- a/pkg/sentry/fsimpl/testutil/kernel.go
+++ b/pkg/sentry/fsimpl/testutil/kernel.go
@@ -62,6 +62,7 @@ func Boot() (*kernel.Kernel, error) {
return nil, fmt.Errorf("creating platform: %v", err)
}
+ kernel.VFS2Enabled = true
k := &kernel.Kernel{
Platform: plat,
}
@@ -73,7 +74,7 @@ func Boot() (*kernel.Kernel, error) {
k.SetMemoryFile(mf)
// Pass k as the platform since it is savable, unlike the actual platform.
- vdso, err := loader.PrepareVDSO(nil, k)
+ vdso, err := loader.PrepareVDSO(k)
if err != nil {
return nil, fmt.Errorf("creating vdso: %v", err)
}
@@ -103,11 +104,6 @@ func Boot() (*kernel.Kernel, error) {
return nil, fmt.Errorf("initializing kernel: %v", err)
}
- kernel.VFS2Enabled = true
-
- if err := k.VFS().Init(); err != nil {
- return nil, fmt.Errorf("VFS init: %v", err)
- }
k.VFS().MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
AllowUserList: true,
@@ -126,12 +122,16 @@ func Boot() (*kernel.Kernel, error) {
// CreateTask creates a new bare bones task for tests.
func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns *vfs.MountNamespace, root, cwd vfs.VirtualDentry) (*kernel.Task, error) {
k := kernel.KernelFromContext(ctx)
+ if k == nil {
+ return nil, fmt.Errorf("cannot find kernel from context")
+ }
+
exe, err := newFakeExecutable(ctx, k.VFS(), auth.CredentialsFromContext(ctx), root)
if err != nil {
return nil, err
}
m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation)
- m.SetExecutable(fsbridge.NewVFSFile(exe))
+ m.SetExecutable(ctx, fsbridge.NewVFSFile(exe))
config := &kernel.TaskConfig{
Kernel: k,
@@ -147,7 +147,12 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns
FSContext: kernel.NewFSContextVFS2(root, cwd, 0022),
FDTable: k.NewFDTable(),
}
- return k.TaskSet().NewTask(config)
+ t, err := k.TaskSet().NewTask(ctx, config)
+ if err != nil {
+ config.ThreadGroup.Release(ctx)
+ return nil, err
+ }
+ return t, nil
}
func newFakeExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry) (*vfs.FileDescription, error) {
diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go
index 0556af877..1a8525b06 100644
--- a/pkg/sentry/fsimpl/testutil/testutil.go
+++ b/pkg/sentry/fsimpl/testutil/testutil.go
@@ -46,16 +46,18 @@ type System struct {
// NewSystem constructs a System.
//
-// Precondition: Caller must hold a reference on MntNs, whose ownership
+// Precondition: Caller must hold a reference on mns, whose ownership
// is transferred to the new System.
func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns *vfs.MountNamespace) *System {
+ root := mns.Root()
+ root.IncRef()
s := &System{
t: t,
Ctx: ctx,
Creds: auth.CredentialsFromContext(ctx),
VFS: v,
MntNs: mns,
- Root: mns.Root(),
+ Root: root,
}
return s
}
@@ -97,8 +99,8 @@ func (s *System) WithTemporaryContext(ctx context.Context) *System {
// Destroy releases resources associated with a test system.
func (s *System) Destroy() {
- s.Root.DecRef()
- s.MntNs.DecRef() // Reference on MntNs passed to NewSystem.
+ s.Root.DecRef(s.Ctx)
+ s.MntNs.DecRef(s.Ctx) // Reference on MntNs passed to NewSystem.
}
// ReadToEnd reads the contents of fd until EOF to a string.
@@ -149,7 +151,7 @@ func (s *System) ListDirents(pop *vfs.PathOperation) *DirentCollector {
if err != nil {
s.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
}
- defer fd.DecRef()
+ defer fd.DecRef(s.Ctx)
collector := &DirentCollector{}
if err := fd.IterDirents(s.Ctx, collector); err != nil {
@@ -254,10 +256,10 @@ func (d *DirentCollector) Contains(name string, typ uint8) error {
defer d.mu.Unlock()
dirent, ok := d.dirents[name]
if !ok {
- return fmt.Errorf("No dirent named %q found", name)
+ return fmt.Errorf("no dirent named %q found", name)
}
if dirent.Type != typ {
- return fmt.Errorf("Dirent named %q found, but was expecting type %s, got: %+v", name, linux.DirentType.Parse(uint64(typ)), dirent)
+ return fmt.Errorf("dirent named %q found, but was expecting type %s, got: %+v", name, linux.DirentType.Parse(uint64(typ)), dirent)
}
return nil
}
diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go
index 60c92d626..8853c8ad2 100644
--- a/pkg/sentry/fsimpl/timerfd/timerfd.go
+++ b/pkg/sentry/fsimpl/timerfd/timerfd.go
@@ -26,12 +26,15 @@ import (
"gvisor.dev/gvisor/pkg/waiter"
)
-// TimerFileDescription implements FileDescriptionImpl for timer fds. It also
+// TimerFileDescription implements vfs.FileDescriptionImpl for timer fds. It also
// implements ktime.TimerListener.
+//
+// +stateify savable
type TimerFileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
+ vfs.NoLockFD
events waiter.Queue
timer *ktime.Timer
@@ -46,9 +49,9 @@ var _ vfs.FileDescriptionImpl = (*TimerFileDescription)(nil)
var _ ktime.TimerListener = (*TimerFileDescription)(nil)
// New returns a new timer fd.
-func New(vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, flags uint32) (*vfs.FileDescription, error) {
+func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, flags uint32) (*vfs.FileDescription, error) {
vd := vfsObj.NewAnonVirtualDentry("[timerfd]")
- defer vd.DecRef()
+ defer vd.DecRef(ctx)
tfd := &TimerFileDescription{}
tfd.timer = ktime.NewTimer(clock, tfd)
if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{
@@ -61,7 +64,7 @@ func New(vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, flags uint32) (*vfs.F
return &tfd.vfsfd, nil
}
-// Read implements FileDescriptionImpl.Read.
+// Read implements vfs.FileDescriptionImpl.Read.
func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
const sizeofUint64 = 8
if dst.NumBytes() < sizeofUint64 {
@@ -127,8 +130,8 @@ func (tfd *TimerFileDescription) ResumeTimer() {
tfd.timer.Resume()
}
-// Release implements FileDescriptionImpl.Release()
-func (tfd *TimerFileDescription) Release() {
+// Release implements vfs.FileDescriptionImpl.Release.
+func (tfd *TimerFileDescription) Release(context.Context) {
tfd.timer.Destroy()
}
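Read continues to transfer exactly one little-endian uint64 — the number of timer expirations since the last read — and rejects buffers shorter than eight bytes, matching Linux timerfd semantics. A guest-side sketch using golang.org/x/sys/unix; TimerfdCreate and TimerfdSettime are that package's wrappers, and their availability in the vendored x/sys is an assumption here:

//go:build linux

package main

import (
	"encoding/binary"
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	fd, err := unix.TimerfdCreate(unix.CLOCK_MONOTONIC, 0)
	if err != nil {
		panic(err)
	}
	defer unix.Close(fd)

	// Fire once, 100ms from now.
	spec := unix.ItimerSpec{Value: unix.Timespec{Nsec: 100_000_000}}
	if err := unix.TimerfdSettime(fd, 0, &spec, nil); err != nil {
		panic(err)
	}

	// A timerfd read must supply at least 8 bytes; the kernel (and the
	// sentry's Read above) returns the expiration count as a uint64.
	buf := make([]byte, 8)
	if _, err := unix.Read(fd, buf); err != nil { // blocks until expiry
		panic(err)
	}
	fmt.Println("expirations:", binary.LittleEndian.Uint64(buf))
}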
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index 062321cbc..fe520b6fd 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -26,6 +26,17 @@ go_template_instance(
},
)
+go_template_instance(
+ name = "inode_refs",
+ out = "inode_refs.go",
+ package = "tmpfs",
+ prefix = "inode",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "inode",
+ },
+)
+
go_library(
name = "tmpfs",
srcs = [
@@ -34,8 +45,10 @@ go_library(
"directory.go",
"filesystem.go",
"fstree.go",
+ "inode_refs.go",
"named_pipe.go",
"regular_file.go",
+ "save_restore.go",
"socket_file.go",
"symlink.go",
"tmpfs.go",
@@ -47,6 +60,8 @@ go_library(
"//pkg/context",
"//pkg/fspath",
"//pkg/log",
+ "//pkg/refs",
+ "//pkg/refsvfs2",
"//pkg/safemem",
"//pkg/sentry/arch",
"//pkg/sentry/fs",
@@ -62,7 +77,6 @@ go_library(
"//pkg/sentry/uniqueid",
"//pkg/sentry/usage",
"//pkg/sentry/vfs",
- "//pkg/sentry/vfs/lock",
"//pkg/sentry/vfs/memxattr",
"//pkg/sync",
"//pkg/syserror",
diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
index 2fb5c4d84..3cc63e732 100644
--- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -83,7 +83,7 @@ func fileOpOn(ctx context.Context, mntns *fs.MountNamespace, root, wd *fs.Dirent
}
err = fn(root, d)
- d.DecRef()
+ d.DecRef(ctx)
return err
}
@@ -105,17 +105,17 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to create mount namespace: %v", err)
}
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
var filePathBuilder strings.Builder
filePathBuilder.WriteByte('/')
// Create nested directories with given depth.
root := mntns.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
d := root
d.IncRef()
- defer d.DecRef()
+ defer d.DecRef(ctx)
for i := depth; i > 0; i-- {
name := fmt.Sprintf("%d", i)
if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil {
@@ -125,7 +125,7 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to walk to directory %q: %v", name, err)
}
- d.DecRef()
+ d.DecRef(ctx)
d = next
filePathBuilder.WriteString(name)
filePathBuilder.WriteByte('/')
@@ -136,7 +136,7 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to create file %q: %v", filename, err)
}
- file.DecRef()
+ file.DecRef(ctx)
filePathBuilder.WriteString(filename)
filePath := filePathBuilder.String()
@@ -176,24 +176,25 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) {
// Create VFS.
vfsObj := vfs.VirtualFilesystem{}
- if err := vfsObj.Init(); err != nil {
+ if err := vfsObj.Init(ctx); err != nil {
b.Fatalf("VFS init: %v", err)
}
vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
if err != nil {
b.Fatalf("failed to create tmpfs root mount: %v", err)
}
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
var filePathBuilder strings.Builder
filePathBuilder.WriteByte('/')
// Create nested directories with given depth.
root := mntns.Root()
- defer root.DecRef()
+ root.IncRef()
+ defer root.DecRef(ctx)
vd := root
vd.IncRef()
for i := depth; i > 0; i-- {
@@ -212,7 +213,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to walk to directory %q: %v", name, err)
}
- vd.DecRef()
+ vd.DecRef(ctx)
vd = nextVD
filePathBuilder.WriteString(name)
filePathBuilder.WriteByte('/')
@@ -228,12 +229,12 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) {
Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
Mode: 0644,
})
- vd.DecRef()
+ vd.DecRef(ctx)
vd = vfs.VirtualDentry{}
if err != nil {
b.Fatalf("failed to create file %q: %v", filename, err)
}
- defer fd.DecRef()
+ defer fd.DecRef(ctx)
filePathBuilder.WriteString(filename)
filePath := filePathBuilder.String()
@@ -278,14 +279,14 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to create mount namespace: %v", err)
}
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
var filePathBuilder strings.Builder
filePathBuilder.WriteByte('/')
// Create and mount the submount.
root := mntns.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
if err := root.Inode.CreateDirectory(ctx, root, mountPointName, fs.FilePermsFromMode(0755)); err != nil {
b.Fatalf("failed to create mount point: %v", err)
}
@@ -293,7 +294,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to walk to mount point: %v", err)
}
- defer mountPoint.DecRef()
+ defer mountPoint.DecRef(ctx)
submountInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
if err != nil {
b.Fatalf("failed to create tmpfs submount: %v", err)
@@ -309,7 +310,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to walk to mount root: %v", err)
}
- defer d.DecRef()
+ defer d.DecRef(ctx)
for i := depth; i > 0; i-- {
name := fmt.Sprintf("%d", i)
if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil {
@@ -319,7 +320,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to walk to directory %q: %v", name, err)
}
- d.DecRef()
+ d.DecRef(ctx)
d = next
filePathBuilder.WriteString(name)
filePathBuilder.WriteByte('/')
@@ -330,7 +331,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to create file %q: %v", filename, err)
}
- file.DecRef()
+ file.DecRef(ctx)
filePathBuilder.WriteString(filename)
filePath := filePathBuilder.String()
@@ -370,24 +371,25 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
// Create VFS.
vfsObj := vfs.VirtualFilesystem{}
- if err := vfsObj.Init(); err != nil {
+ if err := vfsObj.Init(ctx); err != nil {
b.Fatalf("VFS init: %v", err)
}
vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
if err != nil {
b.Fatalf("failed to create tmpfs root mount: %v", err)
}
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
var filePathBuilder strings.Builder
filePathBuilder.WriteByte('/')
// Create the mount point.
root := mntns.Root()
- defer root.DecRef()
+ root.IncRef()
+ defer root.DecRef(ctx)
pop := vfs.PathOperation{
Root: root,
Start: root,
@@ -403,9 +405,9 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to walk to mount point: %v", err)
}
- defer mountPoint.DecRef()
+ defer mountPoint.DecRef(ctx)
// Create and mount the submount.
- if err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil {
+ if _, err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil {
b.Fatalf("failed to mount tmpfs submount: %v", err)
}
filePathBuilder.WriteString(mountPointName)
@@ -432,7 +434,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to walk to directory %q: %v", name, err)
}
- vd.DecRef()
+ vd.DecRef(ctx)
vd = nextVD
filePathBuilder.WriteString(name)
filePathBuilder.WriteByte('/')
@@ -448,11 +450,11 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
Mode: 0644,
})
- vd.DecRef()
+ vd.DecRef(ctx)
if err != nil {
b.Fatalf("failed to create file %q: %v", filename, err)
}
- fd.DecRef()
+ fd.DecRef(ctx)
filePathBuilder.WriteString(filename)
filePath := filePathBuilder.String()
diff --git a/pkg/sentry/fsimpl/tmpfs/device_file.go b/pkg/sentry/fsimpl/tmpfs/device_file.go
index 83bf885ee..9129d35b7 100644
--- a/pkg/sentry/fsimpl/tmpfs/device_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/device_file.go
@@ -22,6 +22,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/vfs"
)
+// +stateify savable
type deviceFile struct {
inode inode
kind vfs.DeviceKind
@@ -29,7 +30,7 @@ type deviceFile struct {
minor uint32
}
-func (fs *filesystem) newDeviceFile(creds *auth.Credentials, mode linux.FileMode, kind vfs.DeviceKind, major, minor uint32) *inode {
+func (fs *filesystem) newDeviceFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, kind vfs.DeviceKind, major, minor uint32) *inode {
file := &deviceFile{
kind: kind,
major: major,
@@ -43,7 +44,7 @@ func (fs *filesystem) newDeviceFile(creds *auth.Credentials, mode linux.FileMode
default:
panic(fmt.Sprintf("invalid DeviceKind: %v", kind))
}
- file.inode.init(file, fs, creds, mode)
+ file.inode.init(file, fs, kuid, kgid, mode)
file.inode.nlink = 1 // from parent directory
return &file.inode
}
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index 70387cb9c..e90669cf0 100644
--- a/pkg/sentry/fsimpl/tmpfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -25,6 +25,7 @@ import (
"gvisor.dev/gvisor/pkg/syserror"
)
+// +stateify savable
type directory struct {
// Since directories can't be hard-linked, each directory can only be
// associated with a single dentry, which we can store in the directory
@@ -44,21 +45,22 @@ type directory struct {
// (with inode == nil) that represent the iteration position of
// directoryFDs. childList is used to support directoryFD.IterDirents()
// efficiently. childList is protected by iterMu.
- iterMu sync.Mutex
+ iterMu sync.Mutex `state:"nosave"`
childList dentryList
}
-func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *directory {
+func (fs *filesystem) newDirectory(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *directory {
dir := &directory{}
- dir.inode.init(dir, fs, creds, linux.S_IFDIR|mode)
+ dir.inode.init(dir, fs, kuid, kgid, linux.S_IFDIR|mode)
dir.inode.nlink = 2 // from "." and parent directory or ".." for root
dir.dentry.inode = &dir.inode
dir.dentry.vfsd.Init(&dir.dentry)
return dir
}
-// Preconditions: filesystem.mu must be locked for writing. dir must not
-// already contain a child with the given name.
+// Preconditions:
+// * filesystem.mu must be locked for writing.
+// * dir must not already contain a child with the given name.
func (dir *directory) insertChildLocked(child *dentry, name string) {
child.parent = &dir.dentry
child.name = name
@@ -79,9 +81,13 @@ func (dir *directory) removeChildLocked(child *dentry) {
dir.iterMu.Lock()
dir.childList.Remove(child)
dir.iterMu.Unlock()
- child.unlinked = true
}
+func (dir *directory) mayDelete(creds *auth.Credentials, child *dentry) error {
+ return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&dir.inode.mode)), auth.KUID(atomic.LoadUint32(&child.inode.uid)))
+}
+
+// +stateify savable
type directoryFD struct {
fileDescription
vfs.DirectoryFileDescriptionDefaultImpl
@@ -92,7 +98,7 @@ type directoryFD struct {
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *directoryFD) Release() {
+func (fd *directoryFD) Release(ctx context.Context) {
if fd.iter != nil {
dir := fd.inode().impl.(*directory)
dir.iterMu.Lock()
@@ -107,13 +113,14 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
fs := fd.filesystem()
dir := fd.inode().impl.(*directory)
+ defer fd.dentry().InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent)
+
// fs.mu is required to read d.parent and dentry.name.
fs.mu.RLock()
defer fs.mu.RUnlock()
dir.iterMu.Lock()
defer dir.iterMu.Unlock()
- fd.dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
fd.inode().touchAtime(fd.vfsfd.Mount())
if fd.off == 0 {
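The new mayDelete hook enforces the sticky-bit rule through vfs.CheckDeleteSticky: when the parent directory has S_ISVTX set, only the entry's owner or the directory's owner (or a suitably capable task) may unlink or rename it. A simplified, self-contained version of that check (capability handling deliberately omitted):

package main

import (
	"fmt"
	"os"
)

const stickyBit = 0o1000 // linux.ModeSticky / S_ISVTX

// checkDeleteSticky is a simplified stand-in for vfs.CheckDeleteSticky:
// with the sticky bit set on the parent, deletion requires owning either
// the entry or the directory (CAP_FOWNER handling omitted here).
func checkDeleteSticky(callerUID, parentMode, parentUID, childUID uint32) error {
	if parentMode&stickyBit == 0 {
		return nil
	}
	if callerUID == childUID || callerUID == parentUID {
		return nil
	}
	return os.ErrPermission // the sentry returns EPERM
}

func main() {
	// uid 1001 may not delete uid 1000's file from root's sticky /tmp-like dir...
	fmt.Println(checkDeleteSticky(1001, 0o1777, 0, 1000)) // permission denied
	// ...but the file's owner may.
	fmt.Println(checkDeleteSticky(1000, 0o1777, 0, 1000)) // <nil>
}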
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 183eb975c..e39cd305b 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -25,7 +25,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
)
// Sync implements vfs.FilesystemImpl.Sync.
@@ -39,8 +38,10 @@ func (fs *filesystem) Sync(ctx context.Context) error {
//
// stepLocked is loosely analogous to fs/namei.c:walk_component().
//
-// Preconditions: filesystem.mu must be locked. !rp.Done().
-func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
+// Preconditions:
+// * filesystem.mu must be locked.
+// * !rp.Done().
+func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
dir, ok := d.inode.impl.(*directory)
if !ok {
return nil, syserror.ENOTDIR
@@ -55,13 +56,13 @@ afterSymlink:
return d, nil
}
if name == ".." {
- if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil {
+ if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
return nil, err
} else if isRoot || d.parent == nil {
rp.Advance()
return d, nil
}
- if err := rp.CheckMount(&d.parent.vfsd); err != nil {
+ if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
return nil, err
}
rp.Advance()
@@ -74,12 +75,12 @@ afterSymlink:
if !ok {
return nil, syserror.ENOENT
}
- if err := rp.CheckMount(&child.vfsd); err != nil {
+ if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
return nil, err
}
if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
// Symlink traversal updates access time.
- atomic.StoreInt64(&d.inode.atime, d.inode.fs.clock.Now().Nanoseconds())
+ child.inode.touchAtime(rp.Mount())
if err := rp.HandleSymlink(symlink.target); err != nil {
return nil, err
}
@@ -97,10 +98,12 @@ afterSymlink:
// walkParentDirLocked is loosely analogous to Linux's
// fs/namei.c:path_parentat().
//
-// Preconditions: filesystem.mu must be locked. !rp.Done().
-func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*directory, error) {
+// Preconditions:
+// * filesystem.mu must be locked.
+// * !rp.Done().
+func walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*directory, error) {
for !rp.Final() {
- next, err := stepLocked(rp, d)
+ next, err := stepLocked(ctx, rp, d)
if err != nil {
return nil, err
}
@@ -118,10 +121,10 @@ func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*directory, error) {
// resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
//
// Preconditions: filesystem.mu must be locked.
-func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) {
+func resolveLocked(ctx context.Context, rp *vfs.ResolvingPath) (*dentry, error) {
d := rp.Start().Impl().(*dentry)
for !rp.Done() {
- next, err := stepLocked(rp, d)
+ next, err := stepLocked(ctx, rp, d)
if err != nil {
return nil, err
}
@@ -139,12 +142,13 @@ func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) {
// doCreateAt is loosely analogous to a conjunction of Linux's
// fs/namei.c:filename_create() and done_path_create().
//
-// Preconditions: !rp.Done(). For the final path component in rp,
-// !rp.ShouldFollowSymlink().
-func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error {
+// Preconditions:
+// * !rp.Done().
+// * For the final path component in rp, !rp.ShouldFollowSymlink().
+func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error {
fs.mu.Lock()
defer fs.mu.Unlock()
- parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+ parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
if err != nil {
return err
}
@@ -182,7 +186,7 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa
if dir {
ev |= linux.IN_ISDIR
}
- parentDir.inode.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent)
+ parentDir.inode.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
parentDir.inode.touchCMtime()
return nil
}
@@ -191,7 +195,7 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa
func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
fs.mu.RLock()
defer fs.mu.RUnlock()
- d, err := resolveLocked(rp)
+ d, err := resolveLocked(ctx, rp)
if err != nil {
return err
}
@@ -202,7 +206,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds
func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
- d, err := resolveLocked(rp)
+ d, err := resolveLocked(ctx, rp)
if err != nil {
return nil, err
}
@@ -222,7 +226,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
- dir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+ dir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
if err != nil {
return nil, err
}
@@ -232,35 +236,40 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa
// LinkAt implements vfs.FilesystemImpl.LinkAt.
func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
- return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error {
+ return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error {
if rp.Mount() != vd.Mount() {
return syserror.EXDEV
}
d := vd.Dentry().Impl().(*dentry)
- if d.inode.isDir() {
+ i := d.inode
+ if i.isDir() {
return syserror.EPERM
}
- if d.inode.nlink == 0 {
+ if err := vfs.MayLink(auth.CredentialsFromContext(ctx), linux.FileMode(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
+ return err
+ }
+ if i.nlink == 0 {
return syserror.ENOENT
}
- if d.inode.nlink == maxLinks {
+ if i.nlink == maxLinks {
return syserror.EMLINK
}
- d.inode.incLinksLocked()
- d.inode.watches.Notify("", linux.IN_ATTRIB, 0, vfs.InodeEvent)
- parentDir.insertChildLocked(fs.newDentry(d.inode), name)
+ i.incLinksLocked()
+ i.watches.Notify(ctx, "", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */)
+ parentDir.insertChildLocked(fs.newDentry(i), name)
return nil
})
}
// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
- return fs.doCreateAt(rp, true /* dir */, func(parentDir *directory, name string) error {
+ return fs.doCreateAt(ctx, rp, true /* dir */, func(parentDir *directory, name string) error {
+ creds := rp.Credentials()
if parentDir.inode.nlink == maxLinks {
return syserror.EMLINK
}
parentDir.inode.incLinksLocked() // from child's ".."
- childDir := fs.newDirectory(rp.Credentials(), opts.Mode)
+ childDir := fs.newDirectory(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)
parentDir.insertChildLocked(&childDir.dentry, name)
return nil
})
@@ -268,19 +277,20 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
// MknodAt implements vfs.FilesystemImpl.MknodAt.
func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
- return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error {
+ return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error {
+ creds := rp.Credentials()
var childInode *inode
switch opts.Mode.FileType() {
- case 0, linux.S_IFREG:
- childInode = fs.newRegularFile(rp.Credentials(), opts.Mode)
+ case linux.S_IFREG:
+ childInode = fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)
case linux.S_IFIFO:
- childInode = fs.newNamedPipe(rp.Credentials(), opts.Mode)
+ childInode = fs.newNamedPipe(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)
case linux.S_IFBLK:
- childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor)
+ childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor)
case linux.S_IFCHR:
- childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor)
+ childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor)
case linux.S_IFSOCK:
- childInode = fs.newSocketFile(rp.Credentials(), opts.Mode, opts.Endpoint)
+ childInode = fs.newSocketFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, opts.Endpoint)
default:
return syserror.EINVAL
}
@@ -301,30 +311,43 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
// don't need fs.mu for writing.
if opts.Flags&linux.O_CREAT == 0 {
fs.mu.RLock()
- defer fs.mu.RUnlock()
- d, err := resolveLocked(rp)
+ d, err := resolveLocked(ctx, rp)
if err != nil {
+ fs.mu.RUnlock()
return nil, err
}
+ d.IncRef()
+ defer d.DecRef(ctx)
+ fs.mu.RUnlock()
return d.open(ctx, rp, &opts, false /* afterCreate */)
}
mustCreate := opts.Flags&linux.O_EXCL != 0
start := rp.Start().Impl().(*dentry)
fs.mu.Lock()
- defer fs.mu.Unlock()
+ unlocked := false
+ unlock := func() {
+ if !unlocked {
+ fs.mu.Unlock()
+ unlocked = true
+ }
+ }
+ defer unlock()
if rp.Done() {
- // Reject attempts to open directories with O_CREAT.
+ // Reject attempts to open the mount root directory with O_CREAT.
if rp.MustBeDir() {
return nil, syserror.EISDIR
}
if mustCreate {
return nil, syserror.EEXIST
}
+ start.IncRef()
+ defer start.DecRef(ctx)
+ unlock()
return start.open(ctx, rp, &opts, false /* afterCreate */)
}
afterTrailingSymlink:
- parentDir, err := walkParentDirLocked(rp, start)
+ parentDir, err := walkParentDirLocked(ctx, rp, start)
if err != nil {
return nil, err
}
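The OpenAt rewrite above replaces a bare deferred unlock with an idempotent unlock closure, so fs.mu can be dropped before d.open — which may block indefinitely on pipes — while every early-return path still releases it exactly once. The shape of that pattern, extracted into a runnable sketch:

package main

import (
	"fmt"
	"sync"
)

func openLikeOperation(mu *sync.Mutex) error {
	mu.Lock()
	unlocked := false
	unlock := func() {
		if !unlocked {
			mu.Unlock()
			unlocked = true
		}
	}
	defer unlock() // covers every early return below

	// ... lookup work that needs the lock ...

	unlock() // drop the lock before a call that may block
	// ... blocking open of a pipe, etc. ...
	return nil
}

func main() {
	var mu sync.Mutex
	fmt.Println(openLikeOperation(&mu)) // <nil>; lock released exactly once
}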
@@ -355,37 +378,46 @@ afterTrailingSymlink:
}
defer rp.Mount().EndWrite()
// Create and open the child.
- child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
+ creds := rp.Credentials()
+ child := fs.newDentry(fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode))
parentDir.insertChildLocked(child, name)
+ unlock()
fd, err := child.open(ctx, rp, &opts, true)
if err != nil {
return nil, err
}
- parentDir.inode.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent)
+ parentDir.inode.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */)
parentDir.inode.touchCMtime()
return fd, nil
}
+ if mustCreate {
+ return nil, syserror.EEXIST
+ }
// Is the file mounted over?
- if err := rp.CheckMount(&child.vfsd); err != nil {
+ if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
return nil, err
}
// Do we need to resolve a trailing symlink?
if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
// Symlink traversal updates access time.
- atomic.StoreInt64(&child.inode.atime, child.inode.fs.clock.Now().Nanoseconds())
+ child.inode.touchAtime(rp.Mount())
if err := rp.HandleSymlink(symlink.target); err != nil {
return nil, err
}
start = &parentDir.dentry
goto afterTrailingSymlink
}
- // Open existing file.
- if mustCreate {
- return nil, syserror.EEXIST
+ if rp.MustBeDir() && !child.inode.isDir() {
+ return nil, syserror.ENOTDIR
}
+ child.IncRef()
+ defer child.DecRef(ctx)
+ unlock()
return child.open(ctx, rp, &opts, false)
}
+// Preconditions: The caller must hold no locks (since opening pipes may block
+// indefinitely).
func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) {
ats := vfs.AccessTypesForOpenFlags(opts)
if !afterCreate {
@@ -396,10 +428,11 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
switch impl := d.inode.impl.(type) {
case *regularFile:
var fd regularFileFD
- if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+ fd.LockFD.Init(&d.inode.locks)
+ if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil {
return nil, err
}
- if opts.Flags&linux.O_TRUNC != 0 {
+ if !afterCreate && opts.Flags&linux.O_TRUNC != 0 {
if _, err := impl.truncate(0); err != nil {
return nil, err
}
@@ -411,15 +444,16 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
return nil, syserror.EISDIR
}
var fd directoryFD
- if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+ fd.LockFD.Init(&d.inode.locks)
+ if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil {
return nil, err
}
return &fd.vfsfd, nil
case *symlink:
- // Can't open symlinks without O_PATH (which is unimplemented).
+ // TODO(gvisor.dev/issue/2782): Can't open symlinks without O_PATH.
return nil, syserror.ELOOP
case *namedPipe:
- return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags)
+ return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags, &d.inode.locks)
case *deviceFile:
return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts)
case *socketFile:
@@ -433,7 +467,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
- d, err := resolveLocked(rp)
+ d, err := resolveLocked(ctx, rp)
if err != nil {
return "", err
}
@@ -455,7 +489,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
// Resolve newParent first to verify that it's on this Mount.
fs.mu.Lock()
defer fs.mu.Unlock()
- newParentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+ newParentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
if err != nil {
return err
}
@@ -480,6 +514,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
if !ok {
return syserror.ENOENT
}
+ if err := oldParentDir.mayDelete(rp.Credentials(), renamed); err != nil {
+ return err
+ }
// Note that we don't need to call rp.CheckMount(), since if renamed is a
// mount point then we want to rename the mount point, not anything in the
// mounted filesystem.
@@ -540,7 +577,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
}
vfsObj := rp.VirtualFilesystem()
mntns := vfs.MountNamespaceFromContext(ctx)
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
var replacedVFSD *vfs.Dentry
if replaced != nil {
replacedVFSD = &replaced.vfsd
@@ -551,17 +588,19 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
if replaced != nil {
newParentDir.removeChildLocked(replaced)
if replaced.inode.isDir() {
- newParentDir.inode.decLinksLocked() // from replaced's ".."
+ // Remove links for replaced/. and replaced/..
+ replaced.inode.decLinksLocked(ctx)
+ newParentDir.inode.decLinksLocked(ctx)
}
- replaced.inode.decLinksLocked()
+ replaced.inode.decLinksLocked(ctx)
}
oldParentDir.removeChildLocked(renamed)
newParentDir.insertChildLocked(renamed, newName)
- vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD)
+ vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD)
oldParentDir.inode.touchCMtime()
if oldParentDir != newParentDir {
if renamed.inode.isDir() {
- oldParentDir.inode.decLinksLocked()
+ oldParentDir.inode.decLinksLocked(ctx)
newParentDir.inode.incLinksLocked()
}
newParentDir.inode.touchCMtime()
@@ -576,7 +615,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
fs.mu.Lock()
defer fs.mu.Unlock()
- parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+ parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
if err != nil {
return err
}
@@ -594,6 +633,9 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
if !ok {
return syserror.ENOENT
}
+ if err := parentDir.mayDelete(rp.Credentials(), child); err != nil {
+ return err
+ }
childDir, ok := child.inode.impl.(*directory)
if !ok {
return syserror.ENOTDIR
@@ -608,17 +650,17 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
defer mnt.EndWrite()
vfsObj := rp.VirtualFilesystem()
mntns := vfs.MountNamespaceFromContext(ctx)
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
return err
}
parentDir.removeChildLocked(child)
- parentDir.inode.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent)
+ parentDir.inode.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */)
// Remove links for child, child/., and child/..
- child.inode.decLinksLocked()
- child.inode.decLinksLocked()
- parentDir.inode.decLinksLocked()
- vfsObj.CommitDeleteDentry(&child.vfsd)
+ child.inode.decLinksLocked(ctx)
+ child.inode.decLinksLocked(ctx)
+ parentDir.inode.decLinksLocked(ctx)
+ vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
parentDir.inode.touchCMtime()
return nil
}
@@ -626,17 +668,19 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
fs.mu.RLock()
- defer fs.mu.RUnlock()
- d, err := resolveLocked(rp)
+ d, err := resolveLocked(ctx, rp)
if err != nil {
+ fs.mu.RUnlock()
return err
}
- if err := d.inode.setStat(ctx, rp.Credentials(), &opts.Stat); err != nil {
+ err = d.inode.setStat(ctx, rp.Credentials(), &opts)
+ fs.mu.RUnlock()
+ if err != nil {
return err
}
if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
- d.InotifyWithParent(ev, 0, vfs.InodeEvent)
+ d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
}
return nil
}
@@ -645,7 +689,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
- d, err := resolveLocked(rp)
+ d, err := resolveLocked(ctx, rp)
if err != nil {
return linux.Statx{}, err
}
@@ -658,25 +702,17 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
- if _, err := resolveLocked(rp); err != nil {
+ if _, err := resolveLocked(ctx, rp); err != nil {
return linux.Statfs{}, err
}
- statfs := linux.Statfs{
- Type: linux.TMPFS_MAGIC,
- BlockSize: usermem.PageSize,
- FragmentSize: usermem.PageSize,
- NameLength: linux.NAME_MAX,
- // TODO(b/29637826): Allow configuring a tmpfs size and enforce it.
- Blocks: 0,
- BlocksFree: 0,
- }
- return statfs, nil
+ return globalStatfs, nil
}
// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
- return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error {
- child := fs.newDentry(fs.newSymlink(rp.Credentials(), target))
+ return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error {
+ creds := rp.Credentials()
+ child := fs.newDentry(fs.newSymlink(creds.EffectiveKUID, creds.EffectiveKGID, 0777, target))
parentDir.insertChildLocked(child, name)
return nil
})
@@ -686,7 +722,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
fs.mu.Lock()
defer fs.mu.Unlock()
- parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+ parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
if err != nil {
return err
}
@@ -701,6 +737,9 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
if !ok {
return syserror.ENOENT
}
+ if err := parentDir.mayDelete(rp.Credentials(), child); err != nil {
+ return err
+ }
if child.inode.isDir() {
return syserror.EISDIR
}
@@ -714,7 +753,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
defer mnt.EndWrite()
vfsObj := rp.VirtualFilesystem()
mntns := vfs.MountNamespaceFromContext(ctx)
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
return err
}
@@ -722,20 +761,20 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
// Generate inotify events. Note that this must take place before the link
// count of the child is decremented, or else the watches may be dropped
// before these events are added.
- vfs.InotifyRemoveChild(&child.inode.watches, &parentDir.inode.watches, name)
+ vfs.InotifyRemoveChild(ctx, &child.inode.watches, &parentDir.inode.watches, name)
parentDir.removeChildLocked(child)
- child.inode.decLinksLocked()
- vfsObj.CommitDeleteDentry(&child.vfsd)
+ child.inode.decLinksLocked(ctx)
+ vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
parentDir.inode.touchCMtime()
return nil
}
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
- d, err := resolveLocked(rp)
+ d, err := resolveLocked(ctx, rp)
if err != nil {
return nil, err
}
@@ -744,63 +783,70 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
}
switch impl := d.inode.impl.(type) {
case *socketFile:
+ if impl.ep == nil {
+ return nil, syserror.ECONNREFUSED
+ }
return impl.ep, nil
default:
return nil, syserror.ECONNREFUSED
}
}
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
- d, err := resolveLocked(rp)
+ d, err := resolveLocked(ctx, rp)
if err != nil {
return nil, err
}
- return d.inode.listxattr(size)
+ return d.inode.listXattr(size)
}
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
- d, err := resolveLocked(rp)
+ d, err := resolveLocked(ctx, rp)
if err != nil {
return "", err
}
- return d.inode.getxattr(rp.Credentials(), &opts)
+ return d.inode.getXattr(rp.Credentials(), &opts)
}
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
fs.mu.RLock()
- defer fs.mu.RUnlock()
- d, err := resolveLocked(rp)
+ d, err := resolveLocked(ctx, rp)
if err != nil {
+ fs.mu.RUnlock()
return err
}
- if err := d.inode.setxattr(rp.Credentials(), &opts); err != nil {
+ err = d.inode.setXattr(rp.Credentials(), &opts)
+ fs.mu.RUnlock()
+ if err != nil {
return err
}
- d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
return nil
}
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
fs.mu.RLock()
- defer fs.mu.RUnlock()
- d, err := resolveLocked(rp)
+ d, err := resolveLocked(ctx, rp)
if err != nil {
+ fs.mu.RUnlock()
return err
}
- if err := d.inode.removexattr(rp.Credentials(), name); err != nil {
+ err = d.inode.removeXattr(rp.Credentials(), name)
+ fs.mu.RUnlock()
+ if err != nil {
return err
}
- d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
return nil
}
@@ -819,8 +865,16 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
}
if d.parent == nil {
if d.name != "" {
- // This must be an anonymous memfd file.
+ // This file must have been created by
+ // newUnlinkedRegularFileDescription(). In Linux,
+ // mm/shmem.c:__shmem_file_setup() =>
+ // fs/file_table.c:alloc_file_pseudo() sets the created
+ // dentry's dentry_operations to anon_ops, for which d_dname ==
+ // simple_dname. fs/d_path.c:simple_dname() defines the
+ // dentry's pathname to be its name, prefixed with "/" and
+ // suffixed with " (deleted)".
b.PrependComponent("/" + d.name)
+ b.AppendString(" (deleted)")
return vfs.PrependPathSyntheticError{}
}
return vfs.PrependPathAtNonMountRootError{}
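
As an aside on the pathname this produces: on Linux, reading the /proc/self/fd symlink for a memfd shows exactly the simple_dname format described above. A self-contained sketch of that host behavior (standard memfd_create(2) semantics, not part of this change):

package main

import (
	"fmt"
	"os"

	"golang.org/x/sys/unix"
)

func main() {
	fd, err := unix.MemfdCreate("example", 0)
	if err != nil {
		panic(err)
	}
	defer unix.Close(fd)

	// The kernel names the inode "memfd:example"; its pathname is that
	// name prefixed with "/" and suffixed with " (deleted)".
	path, err := os.Readlink(fmt.Sprintf("/proc/self/fd/%d", fd))
	if err != nil {
		panic(err)
	}
	fmt.Println(path) // "/memfd:example (deleted)"
}
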
diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
index 8d77b3fa8..d772db9e9 100644
--- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -21,6 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
+// +stateify savable
type namedPipe struct {
inode inode
@@ -28,11 +29,11 @@ type namedPipe struct {
}
// Preconditions:
-// * fs.mu must be locked.
-// * rp.Mount().CheckBeginWrite() has been called successfully.
-func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode {
+// * fs.mu must be locked.
+// * rp.Mount().CheckBeginWrite() has been called successfully.
+func (fs *filesystem) newNamedPipe(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode {
file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)}
- file.inode.init(file, fs, creds, linux.S_IFIFO|mode)
+ file.inode.init(file, fs, kuid, kgid, linux.S_IFIFO|mode)
file.inode.nlink = 1 // Only the parent has a link.
return &file.inode
}
diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
index 1614f2c39..2f856ce36 100644
--- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
@@ -32,7 +32,7 @@ const fileName = "mypipe"
func TestSeparateFDs(t *testing.T) {
ctx, creds, vfsObj, root := setup(t)
- defer root.DecRef()
+ defer root.DecRef(ctx)
// Open the read side. This is done concurrently because opening
// one end of the pipe blocks until the other end is opened.
@@ -55,13 +55,13 @@ func TestSeparateFDs(t *testing.T) {
if err != nil {
t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
}
- defer wfd.DecRef()
+ defer wfd.DecRef(ctx)
rfd, ok := <-rfdchan
if !ok {
t.Fatalf("failed to open pipe for reading %q", fileName)
}
- defer rfd.DecRef()
+ defer rfd.DecRef(ctx)
const msg = "vamos azul"
checkEmpty(ctx, t, rfd)
@@ -71,7 +71,7 @@ func TestSeparateFDs(t *testing.T) {
func TestNonblockingRead(t *testing.T) {
ctx, creds, vfsObj, root := setup(t)
- defer root.DecRef()
+ defer root.DecRef(ctx)
// Open the read side as nonblocking.
pop := vfs.PathOperation{
@@ -85,7 +85,7 @@ func TestNonblockingRead(t *testing.T) {
if err != nil {
t.Fatalf("failed to open pipe for reading %q: %v", fileName, err)
}
- defer rfd.DecRef()
+ defer rfd.DecRef(ctx)
// Open the write side.
openOpts = vfs.OpenOptions{Flags: linux.O_WRONLY}
@@ -93,7 +93,7 @@ func TestNonblockingRead(t *testing.T) {
if err != nil {
t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
}
- defer wfd.DecRef()
+ defer wfd.DecRef(ctx)
const msg = "geh blau"
checkEmpty(ctx, t, rfd)
@@ -103,7 +103,7 @@ func TestNonblockingRead(t *testing.T) {
func TestNonblockingWriteError(t *testing.T) {
ctx, creds, vfsObj, root := setup(t)
- defer root.DecRef()
+ defer root.DecRef(ctx)
// Open the write side as nonblocking, which should return ENXIO.
pop := vfs.PathOperation{
@@ -121,7 +121,7 @@ func TestNonblockingWriteError(t *testing.T) {
func TestSingleFD(t *testing.T) {
ctx, creds, vfsObj, root := setup(t)
- defer root.DecRef()
+ defer root.DecRef(ctx)
// Open the pipe as readable and writable.
pop := vfs.PathOperation{
@@ -135,7 +135,7 @@ func TestSingleFD(t *testing.T) {
if err != nil {
t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
}
- defer fd.DecRef()
+ defer fd.DecRef(ctx)
const msg = "forza blu"
checkEmpty(ctx, t, fd)
@@ -152,19 +152,20 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
// Create VFS.
vfsObj := &vfs.VirtualFilesystem{}
- if err := vfsObj.Init(); err != nil {
+ if err := vfsObj.Init(ctx); err != nil {
t.Fatalf("VFS init: %v", err)
}
vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
if err != nil {
t.Fatalf("failed to create tmpfs root mount: %v", err)
}
// Create the pipe.
root := mntns.Root()
+ root.IncRef()
pop := vfs.PathOperation{
Root: root,
Start: root,
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index fee174375..98680fde9 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -25,7 +25,6 @@ import (
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
- "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
@@ -37,11 +36,17 @@ import (
)
// regularFile is a regular (=S_IFREG) tmpfs file.
+//
+// +stateify savable
type regularFile struct {
inode inode
// memFile is a platform.File used to allocate pages to this regularFile.
- memFile *pgalloc.MemoryFile
+ memFile *pgalloc.MemoryFile `state:"nosave"`
+
+ // memoryUsageKind is the memory accounting category under which pages backing
+ // this regularFile's contents are accounted.
+ memoryUsageKind usage.MemoryKind
// mapsMu protects mappings.
mapsMu sync.Mutex `state:"nosave"`
@@ -63,7 +68,7 @@ type regularFile struct {
writableMappingPages uint64
// dataMu protects the fields below.
- dataMu sync.RWMutex
+ dataMu sync.RWMutex `state:"nosave"`
// data maps offsets into the file to offsets into memFile that store
// the file's data.
@@ -85,16 +90,77 @@ type regularFile struct {
size uint64
}
-func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode {
+func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode {
file := &regularFile{
- memFile: fs.memFile,
- seals: linux.F_SEAL_SEAL,
+ memFile: fs.mfp.MemoryFile(),
+ memoryUsageKind: usage.Tmpfs,
+ seals: linux.F_SEAL_SEAL,
}
- file.inode.init(file, fs, creds, linux.S_IFREG|mode)
+ file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode)
file.inode.nlink = 1 // from parent directory
return &file.inode
}
+// newUnlinkedRegularFileDescription creates a regular file on the tmpfs
+// filesystem represented by mount and returns an FD representing that file.
+// The new file is not reachable by path traversal from any other file.
+//
+// newUnlinkedRegularFileDescription is analogous to Linux's
+// mm/shmem.c:__shmem_file_setup().
+//
+// Preconditions: mount must be a tmpfs mount.
+func newUnlinkedRegularFileDescription(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, name string) (*regularFileFD, error) {
+ fs, ok := mount.Filesystem().Impl().(*filesystem)
+ if !ok {
+ panic("tmpfs.newUnlinkedRegularFileDescription() called with non-tmpfs mount")
+ }
+
+ inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777)
+ d := fs.newDentry(inode)
+ defer d.DecRef(ctx)
+ d.name = name
+
+ fd := &regularFileFD{}
+ fd.Init(&inode.locks)
+ flags := uint32(linux.O_RDWR)
+ if err := fd.vfsfd.Init(fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+ return nil, err
+ }
+ return fd, nil
+}
+
+// NewZeroFile creates a new regular file and file description as for
+// mmap(MAP_SHARED | MAP_ANONYMOUS). The file has the given size and is
+// initially (implicitly) filled with zeroes.
+//
+// Preconditions: mount must be a tmpfs mount.
+func NewZeroFile(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, size uint64) (*vfs.FileDescription, error) {
+ // Compare mm/shmem.c:shmem_zero_setup().
+ fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, "dev/zero")
+ if err != nil {
+ return nil, err
+ }
+ rf := fd.inode().impl.(*regularFile)
+ rf.memoryUsageKind = usage.Anonymous
+ rf.size = size
+ return &fd.vfsfd, err
+}
+
+// NewMemfd creates a new regular file and file description as for
+// memfd_create.
+//
+// Preconditions: mount must be a tmpfs mount.
+func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) {
+ fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, name)
+ if err != nil {
+ return nil, err
+ }
+ if allowSeals {
+ fd.inode().impl.(*regularFile).seals = 0
+ }
+ return &fd.vfsfd, nil
+}
+
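For context, a hedged sketch of how a syscall-level caller might consume NewMemfd. The helper name and kernel.Kernel.ShmMount (assumed here to return the sentry's private tmpfs mount for anonymous/shared memory) are illustrative, not part of this diff:

package example

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// memfdCreate is a hypothetical wrapper; k.ShmMount() is assumed.
func memfdCreate(ctx context.Context, k *kernel.Kernel, name string, allowSeals bool) (*vfs.FileDescription, error) {
	creds := auth.CredentialsFromContext(ctx)
	fd, err := tmpfs.NewMemfd(ctx, creds, k.ShmMount(), allowSeals, name)
	if err != nil {
		return nil, err
	}
	// The returned FileDescription carries a reference; the caller
	// installs it in the FD table and then drops its own reference.
	return fd, nil
}
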
// truncate grows or shrinks the file to the given size. It returns true if the
// file size was updated.
func (rf *regularFile) truncate(newSize uint64) (bool, error) {
@@ -227,7 +293,7 @@ func (rf *regularFile) Translate(ctx context.Context, required, optional memmap.
optional.End = pgend
}
- cerr := rf.data.Fill(ctx, required, optional, rf.memFile, usage.Tmpfs, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
+ cerr := rf.data.Fill(ctx, required, optional, rf.size, rf.memFile, rf.memoryUsageKind, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
// Newly-allocated pages are zeroed, so we don't need to do anything.
return dsts.NumBytes(), nil
})
@@ -261,25 +327,50 @@ func (*regularFile) InvalidateUnsavable(context.Context) error {
return nil
}
+// +stateify savable
type regularFileFD struct {
fileDescription
// off is the file offset. off is accessed using atomic memory operations.
// offMu serializes operations that may mutate off.
off int64
- offMu sync.Mutex
+ offMu sync.Mutex `state:"nosave"`
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *regularFileFD) Release() {
+func (fd *regularFileFD) Release(context.Context) {
// noop
}
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ f := fd.inode().impl.(*regularFile)
+
+ f.inode.mu.Lock()
+ defer f.inode.mu.Unlock()
+ oldSize := f.size
+ size := offset + length
+ if oldSize >= size {
+ return nil
+ }
+ _, err := f.truncateLocked(size)
+ return err
+}
+
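Note that Allocate treats every request as default-mode fallocate: it grows the file when offset+length exceeds the current size and is otherwise a no-op; it never shrinks. A runnable illustration of the matching host semantics (hypothetical path; uses golang.org/x/sys/unix):

package main

import (
	"fmt"
	"os"

	"golang.org/x/sys/unix"
)

func main() {
	f, err := os.Create("/tmp/alloc-demo")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// Default-mode fallocate grows the file to cover [0, 4096).
	if err := unix.Fallocate(int(f.Fd()), 0, 0, 4096); err != nil {
		panic(err)
	}
	// A second call covering a smaller range leaves the size unchanged.
	if err := unix.Fallocate(int(f.Fd()), 0, 0, 1024); err != nil {
		panic(err)
	}
	fi, _ := f.Stat()
	fmt.Println(fi.Size()) // 4096
}
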
// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
if offset < 0 {
return 0, syserror.EINVAL
}
+
+ // Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since
+ // all state is in-memory.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+ if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
+ return 0, syserror.EOPNOTSUPP
+ }
+
if dst.NumBytes() == 0 {
return 0, nil
}
@@ -302,40 +393,60 @@ func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts
// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ n, _, err := fd.pwrite(ctx, src, offset, opts)
+ return n, err
+}
+
+// pwrite returns the number of bytes written, final offset and error. The
+// final offset should be ignored by PWrite.
+func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
if offset < 0 {
- return 0, syserror.EINVAL
+ return 0, offset, syserror.EINVAL
+ }
+
+ // Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since
+ // all state is in-memory.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+ if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
+ return 0, offset, syserror.EOPNOTSUPP
}
+
srclen := src.NumBytes()
if srclen == 0 {
- return 0, nil
+ return 0, offset, nil
}
f := fd.inode().impl.(*regularFile)
+ f.inode.mu.Lock()
+ defer f.inode.mu.Unlock()
+ // If the file is opened with O_APPEND, update offset to file size.
+ if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+ // Locking f.inode.mu is sufficient for reading f.size.
+ offset = int64(f.size)
+ }
if end := offset + srclen; end < offset {
// Overflow.
- return 0, syserror.EINVAL
+ return 0, offset, syserror.EINVAL
}
- var err error
srclen, err = vfs.CheckLimit(ctx, offset, srclen)
if err != nil {
- return 0, err
+ return 0, offset, err
}
src = src.TakeFirst64(srclen)
- f.inode.mu.Lock()
rw := getRegularFileReadWriter(f, offset)
n, err := src.CopyInTo(ctx, rw)
- fd.inode().touchCMtimeLocked()
- f.inode.mu.Unlock()
+ f.inode.touchCMtimeLocked()
putRegularFileReadWriter(rw)
- return n, err
+ return n, n + offset, err
}
// Write implements vfs.FileDescriptionImpl.Write.
func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
fd.offMu.Lock()
- n, err := fd.PWrite(ctx, src, fd.off, opts)
- fd.off += n
+ n, off, err := fd.pwrite(ctx, src, fd.off, opts)
+ fd.off = off
fd.offMu.Unlock()
return n, err
}
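
Write threads pwrite's final offset back into fd.off because, under O_APPEND, the effective write position is the file size at write time rather than the caller's cached offset; the old fd.off += n could leave fd.off pointing before the bytes just written. A self-contained sketch of the host semantics being matched (hypothetical path, standard library only):

package main

import (
	"fmt"
	"io"
	"os"
)

func main() {
	const path = "/tmp/append-demo"
	a, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_APPEND|os.O_TRUNC, 0644)
	if err != nil {
		panic(err)
	}
	defer a.Close()
	b, err := os.OpenFile(path, os.O_WRONLY|os.O_APPEND, 0644)
	if err != nil {
		panic(err)
	}
	defer b.Close()

	a.Write([]byte("0123456789")) // file grows to 10 bytes
	n, _ := b.Write([]byte("ab")) // O_APPEND: written at offset 10, not 0
	off, _ := b.Seek(0, io.SeekCurrent)
	fmt.Println(n, off) // 2 12 -- new offset is the append position plus n
}
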
@@ -361,33 +472,6 @@ func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (
return offset, nil
}
-// Sync implements vfs.FileDescriptionImpl.Sync.
-func (fd *regularFileFD) Sync(ctx context.Context) error {
- return nil
-}
-
-// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
-func (fd *regularFileFD) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error {
- return fd.inode().lockBSD(uid, t, block)
-}
-
-// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD.
-func (fd *regularFileFD) UnlockBSD(ctx context.Context, uid lock.UniqueID) error {
- fd.inode().unlockBSD(uid)
- return nil
-}
-
-// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
-func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error {
- return fd.inode().lockPOSIX(uid, t, rng, block)
-}
-
-// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
-func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error {
- fd.inode().unlockPOSIX(uid, rng)
- return nil
-}
-
// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
file := fd.inode().impl.(*regularFile)
@@ -559,7 +643,7 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64,
case gap.Ok():
// Allocate memory for the write.
gapMR := gap.Range().Intersect(pgMR)
- fr, err := rw.file.memFile.Allocate(gapMR.Length(), usage.Tmpfs)
+ fr, err := rw.file.memFile.Allocate(gapMR.Length(), rw.file.memoryUsageKind)
if err != nil {
retErr = err
goto exitLoop
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
index 64e1c40ad..146c7fdfe 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
@@ -138,48 +138,37 @@ func TestLocks(t *testing.T) {
}
defer cleanup()
- var (
- uid1 lock.UniqueID
- uid2 lock.UniqueID
- // Non-blocking.
- block lock.Blocker
- )
-
- uid1 = 123
- uid2 = 456
-
- if err := fd.Impl().LockBSD(ctx, uid1, lock.ReadLock, block); err != nil {
+ uid1 := 123
+ uid2 := 456
+ if err := fd.Impl().LockBSD(ctx, uid1, lock.ReadLock, nil); err != nil {
t.Fatalf("fd.Impl().LockBSD failed: err = %v", err)
}
- if err := fd.Impl().LockBSD(ctx, uid2, lock.ReadLock, block); err != nil {
+ if err := fd.Impl().LockBSD(ctx, uid2, lock.ReadLock, nil); err != nil {
t.Fatalf("fd.Impl().LockBSD failed: err = %v", err)
}
- if got, want := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, block), syserror.ErrWouldBlock; got != want {
+ if got, want := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, nil), syserror.ErrWouldBlock; got != want {
t.Fatalf("fd.Impl().LockBSD failed: got = %v, want = %v", got, want)
}
if err := fd.Impl().UnlockBSD(ctx, uid1); err != nil {
t.Fatalf("fd.Impl().UnlockBSD failed: err = %v", err)
}
- if err := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, block); err != nil {
+ if err := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, nil); err != nil {
t.Fatalf("fd.Impl().LockBSD failed: err = %v", err)
}
- rng1 := lock.LockRange{0, 1}
- rng2 := lock.LockRange{1, 2}
-
- if err := fd.Impl().LockPOSIX(ctx, uid1, lock.ReadLock, rng1, block); err != nil {
+ if err := fd.Impl().LockPOSIX(ctx, uid1, lock.ReadLock, 0, 1, linux.SEEK_SET, nil); err != nil {
t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err)
}
- if err := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, rng2, block); err != nil {
+ if err := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, 1, 2, linux.SEEK_SET, nil); err != nil {
t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err)
}
- if err := fd.Impl().LockPOSIX(ctx, uid1, lock.WriteLock, rng1, block); err != nil {
+ if err := fd.Impl().LockPOSIX(ctx, uid1, lock.WriteLock, 0, 1, linux.SEEK_SET, nil); err != nil {
t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err)
}
- if got, want := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, rng1, block), syserror.ErrWouldBlock; got != want {
+ if got, want := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, 0, 1, linux.SEEK_SET, nil), syserror.ErrWouldBlock; got != want {
t.Fatalf("fd.Impl().LockPOSIX failed: got = %v, want = %v", got, want)
}
- if err := fd.Impl().UnlockPOSIX(ctx, uid1, rng1); err != nil {
+ if err := fd.Impl().UnlockPOSIX(ctx, uid1, 0, 1, linux.SEEK_SET); err != nil {
t.Fatalf("fd.Impl().UnlockPOSIX failed: err = %v", err)
}
}
diff --git a/pkg/sentry/fsimpl/tmpfs/save_restore.go b/pkg/sentry/fsimpl/tmpfs/save_restore.go
new file mode 100644
index 000000000..b27f75cc2
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/save_restore.go
@@ -0,0 +1,20 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+// afterLoad is called by stateify.
+func (rf *regularFile) afterLoad() {
+ rf.memFile = rf.inode.fs.mfp.MemoryFile()
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/socket_file.go b/pkg/sentry/fsimpl/tmpfs/socket_file.go
index 25c2321af..5699d5975 100644
--- a/pkg/sentry/fsimpl/tmpfs/socket_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/socket_file.go
@@ -21,14 +21,16 @@ import (
)
// socketFile is a socket (=S_IFSOCK) tmpfs file.
+//
+// +stateify savable
type socketFile struct {
inode inode
ep transport.BoundEndpoint
}
-func (fs *filesystem) newSocketFile(creds *auth.Credentials, mode linux.FileMode, ep transport.BoundEndpoint) *inode {
+func (fs *filesystem) newSocketFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, ep transport.BoundEndpoint) *inode {
file := &socketFile{ep: ep}
- file.inode.init(file, fs, creds, mode)
+ file.inode.init(file, fs, kuid, kgid, mode)
file.inode.nlink = 1 // from parent directory
return &file.inode
}
diff --git a/pkg/sentry/fsimpl/tmpfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go
index 47e075ed4..a102a2ee2 100644
--- a/pkg/sentry/fsimpl/tmpfs/symlink.go
+++ b/pkg/sentry/fsimpl/tmpfs/symlink.go
@@ -19,16 +19,17 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
)
+// +stateify savable
type symlink struct {
inode inode
target string // immutable
}
-func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode {
+func (fs *filesystem) newSymlink(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, target string) *inode {
link := &symlink{
target: target,
}
- link.inode.init(link, fs, creds, linux.S_IFLNK|0777)
+ link.inode.init(link, fs, kuid, kgid, linux.S_IFLNK|mode)
link.inode.nlink = 1 // from parent directory
return &link.inode
}
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index f0e098702..4ce859d57 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -30,6 +30,7 @@ package tmpfs
import (
"fmt"
"math"
+ "strconv"
"strings"
"sync/atomic"
@@ -40,7 +41,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/sentry/vfs/lock"
"gvisor.dev/gvisor/pkg/sentry/vfs/memxattr"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
@@ -51,14 +51,19 @@ import (
const Name = "tmpfs"
// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type FilesystemType struct{}
// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
type filesystem struct {
vfsfs vfs.Filesystem
- // memFile is used to allocate pages to for regular files.
- memFile *pgalloc.MemoryFile
+ // mfp is used to allocate memory that stores regular file contents. mfp is
+ // immutable.
+ mfp pgalloc.MemoryFileProvider
// clock is a realtime clock used to set timestamps in file operations.
clock time.Clock
@@ -67,9 +72,11 @@ type filesystem struct {
devMinor uint32
// mu serializes changes to the Dentry tree.
- mu sync.RWMutex
+ mu sync.RWMutex `state:"nosave"`
nextInoMinusOne uint64 // accessed using atomic memory operations
+
+ root *dentry
}
// Name implements vfs.FilesystemType.Name.
@@ -77,7 +84,12 @@ func (FilesystemType) Name() string {
return Name
}
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
// FilesystemOpts is used to pass configuration data to tmpfs.
+//
+// +stateify savable
type FilesystemOpts struct {
// RootFileType is the FileType of the filesystem root. Valid values
// are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR.
@@ -95,8 +107,8 @@ type FilesystemOpts struct {
// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, _ string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
- memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx)
- if memFileProvider == nil {
+ mfp := pgalloc.MemoryFileProviderFromContext(ctx)
+ if mfp == nil {
panic("MemoryFileProviderFromContext returned nil")
}
@@ -112,13 +124,65 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
}
}
+ mopts := vfs.GenericParseMountOptions(opts.Data)
+ rootMode := linux.FileMode(0777)
+ if rootFileType == linux.S_IFDIR {
+ rootMode = 01777
+ }
+ modeStr, ok := mopts["mode"]
+ if ok {
+ delete(mopts, "mode")
+ mode, err := strconv.ParseUint(modeStr, 8, 32)
+ if err != nil {
+ ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid mode: %q", modeStr)
+ return nil, nil, syserror.EINVAL
+ }
+ rootMode = linux.FileMode(mode & 07777)
+ }
+ rootKUID := creds.EffectiveKUID
+ uidStr, ok := mopts["uid"]
+ if ok {
+ delete(mopts, "uid")
+ uid, err := strconv.ParseUint(uidStr, 10, 32)
+ if err != nil {
+ ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid uid: %q", uidStr)
+ return nil, nil, syserror.EINVAL
+ }
+ kuid := creds.UserNamespace.MapToKUID(auth.UID(uid))
+ if !kuid.Ok() {
+ ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped uid: %d", uid)
+ return nil, nil, syserror.EINVAL
+ }
+ rootKUID = kuid
+ }
+ rootKGID := creds.EffectiveKGID
+ gidStr, ok := mopts["gid"]
+ if ok {
+ delete(mopts, "gid")
+ gid, err := strconv.ParseUint(gidStr, 10, 32)
+ if err != nil {
+ ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid gid: %q", gidStr)
+ return nil, nil, syserror.EINVAL
+ }
+ kgid := creds.UserNamespace.MapToKGID(auth.GID(gid))
+ if !kgid.Ok() {
+ ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped gid: %d", gid)
+ return nil, nil, syserror.EINVAL
+ }
+ rootKGID = kgid
+ }
+ if len(mopts) != 0 {
+ ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unknown options: %v", mopts)
+ return nil, nil, syserror.EINVAL
+ }
+
devMinor, err := vfsObj.GetAnonBlockDevMinor()
if err != nil {
return nil, nil, err
}
clock := time.RealtimeClockFromContext(ctx)
fs := filesystem{
- memFile: memFileProvider.MemoryFile(),
+ mfp: mfp,
clock: clock,
devMinor: devMinor,
}
@@ -127,15 +191,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
var root *dentry
switch rootFileType {
case linux.S_IFREG:
- root = fs.newDentry(fs.newRegularFile(creds, 0777))
+ root = fs.newDentry(fs.newRegularFile(rootKUID, rootKGID, rootMode))
case linux.S_IFLNK:
- root = fs.newDentry(fs.newSymlink(creds, tmpfsOpts.RootSymlinkTarget))
+ root = fs.newDentry(fs.newSymlink(rootKUID, rootKGID, rootMode, tmpfsOpts.RootSymlinkTarget))
case linux.S_IFDIR:
- root = &fs.newDirectory(creds, 01777).dentry
+ root = &fs.newDirectory(rootKUID, rootKGID, rootMode).dentry
default:
- fs.vfsfs.DecRef()
+ fs.vfsfs.DecRef(ctx)
return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType)
}
+ fs.root = root
return &fs.vfsfs, &root.vfsd, nil
}
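
The option handling above mirrors Linux's tmpfs mount options: vfs.GenericParseMountOptions splits the data string into key=value pairs, mode parses as octal and is masked to the permission bits, and uid/gid must map into the mounter's user namespace. A minimal sketch of the octal-mode round trip (illustrative value):

package main

import (
	"fmt"
	"strconv"
)

func main() {
	// "mode=01777" as it would appear in tmpfs mount data.
	modeStr := "01777"
	mode, err := strconv.ParseUint(modeStr, 8, 32)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%#o\n", mode&07777) // 01777: permission bits only
}
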
@@ -145,11 +210,63 @@ func NewFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *au
}
// Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {
+func (fs *filesystem) Release(ctx context.Context) {
fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+ fs.mu.Lock()
+ if fs.root.inode.isDir() {
+ fs.root.releaseChildrenLocked(ctx)
+ }
+ fs.mu.Unlock()
+}
+
+// releaseChildrenLocked is called on the mount point by filesystem.Release() to
+// destroy all objects in the mount. It performs a depth-first walk of the
+// filesystem and "unlinks" everything by decrementing link counts
+// appropriately. There should be no open file descriptors when this is called,
+// so each inode should only have one outstanding reference that is removed once
+// its link count hits zero.
+//
+// Note that we do not update filesystem state precisely while tearing down (for
+// instance, the child maps are ignored)--we only care to remove all remaining
+// references so that every filesystem object gets destroyed. Also note that we
+// do not need to trigger DecRef on the mount point itself or any child mount;
+// these are taken care of by the destructor of the enclosing MountNamespace.
+//
+// Precondition: filesystem.mu is held.
+func (d *dentry) releaseChildrenLocked(ctx context.Context) {
+ dir := d.inode.impl.(*directory)
+ for _, child := range dir.childMap {
+ if child.inode.isDir() {
+ child.releaseChildrenLocked(ctx)
+ child.inode.decLinksLocked(ctx) // link for child/.
+ dir.inode.decLinksLocked(ctx) // link for child/..
+ }
+ child.inode.decLinksLocked(ctx) // link for child
+ }
+}
+
+// immutable
+var globalStatfs = linux.Statfs{
+ Type: linux.TMPFS_MAGIC,
+ BlockSize: usermem.PageSize,
+ FragmentSize: usermem.PageSize,
+ NameLength: linux.NAME_MAX,
+
+ // tmpfs currently does not support configurable size limits. In Linux,
+ // such a tmpfs mount will return f_blocks == f_bfree == f_bavail == 0 from
+ // statfs(2). However, many applications treat this as having a size limit
+ // of 0. To work around this, claim to have a very large but non-zero size,
+ // chosen to ensure that BlockSize * Blocks does not overflow int64 (which
+ // applications may also handle incorrectly).
+ // TODO(b/29637826): allow configuring a tmpfs size and enforce it.
+ Blocks: math.MaxInt64 / usermem.PageSize,
+ BlocksFree: math.MaxInt64 / usermem.PageSize,
+ BlocksAvailable: math.MaxInt64 / usermem.PageSize,
}
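
The constants above are chosen so that BlockSize * Blocks stays representable as a non-negative int64, which is the property the comment relies on. A quick check, with 4096 standing in for usermem.PageSize:

package main

import (
	"fmt"
	"math"
)

func main() {
	const pageSize = 4096 // usermem.PageSize on most targets
	blocks := int64(math.MaxInt64 / pageSize)
	// Applications that compute total bytes as BlockSize * Blocks see a
	// large positive value rather than an overflow.
	fmt.Println(blocks*pageSize <= math.MaxInt64) // true
	fmt.Println(blocks * pageSize)                // 9223372036854771712
}
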
// dentry implements vfs.DentryImpl.
+//
+// +stateify savable
type dentry struct {
vfsd vfs.Dentry
@@ -163,11 +280,6 @@ type dentry struct {
// filesystem.mu.
name string
- // unlinked indicates whether this dentry has been unlinked from its parent.
- // It is only set to true on an unlink operation, and never set from true to
- // false. unlinked is protected by filesystem.mu.
- unlinked bool
-
// dentryEntry (ugh) links dentries into their parent directory.childList.
dentryEntry
@@ -202,23 +314,27 @@ func (d *dentry) TryIncRef() bool {
}
// DecRef implements vfs.DentryImpl.DecRef.
-func (d *dentry) DecRef() {
- d.inode.decRef()
+func (d *dentry) DecRef(ctx context.Context) {
+ d.inode.decRef(ctx)
}
// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
-func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {
+func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
if d.inode.isDir() {
events |= linux.IN_ISDIR
}
+ // tmpfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates
+ // that d was deleted.
+ deleted := d.vfsd.IsDead()
+
+ d.inode.fs.mu.RLock()
// The ordering below is important, Linux always notifies the parent first.
if d.parent != nil {
- // Note that d.parent or d.name may be stale if there is a concurrent
- // rename operation. Inotify does not provide consistency guarantees.
- d.parent.inode.watches.NotifyWithExclusions(d.name, events, cookie, et, d.unlinked)
+ d.parent.inode.watches.Notify(ctx, d.name, events, cookie, et, deleted)
}
- d.inode.watches.Notify("", events, cookie, et)
+ d.inode.watches.Notify(ctx, "", events, cookie, et, deleted)
+ d.inode.fs.mu.RUnlock()
}
// Watches implements vfs.DentryImpl.Watches.
@@ -226,18 +342,20 @@ func (d *dentry) Watches() *vfs.Watches {
return &d.inode.watches
}
+// OnZeroWatches implements vfs.Dentry.OnZeroWatches.
+func (d *dentry) OnZeroWatches(context.Context) {}
+
// inode represents a filesystem object.
+//
+// +stateify savable
type inode struct {
// fs is the owning filesystem. fs is immutable.
fs *filesystem
- // refs is a reference count. refs is accessed using atomic memory
- // operations.
- //
// A reference is held on all inodes as long as they are reachable in the
// filesystem tree, i.e. nlink is nonzero. This reference is dropped when
// nlink reaches 0.
- refs int64
+ refs inodeRefs
// xattrs implements extended attributes.
//
@@ -246,20 +364,19 @@ type inode struct {
// Inode metadata. Writing multiple fields atomically requires holding
// mu, otherwise atomic operations can be used.
- mu sync.Mutex
- mode uint32 // file type and mode
- nlink uint32 // protected by filesystem.mu instead of inode.mu
- uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
- gid uint32 // auth.KGID, but ...
- ino uint64 // immutable
+ mu sync.Mutex `state:"nosave"`
+ mode uint32 // file type and mode
+ nlink uint32 // protected by filesystem.mu instead of inode.mu
+ uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
+ gid uint32 // auth.KGID, but ...
+ ino uint64 // immutable
// Linux's tmpfs has no concept of btime.
atime int64 // nanoseconds
ctime int64 // nanoseconds
mtime int64 // nanoseconds
- // Advisory file locks, which lock at the inode level.
- locks lock.FileLocks
+ locks vfs.FileLocks
// Inotify watches for this inode.
watches vfs.Watches
@@ -269,15 +386,14 @@ type inode struct {
const maxLinks = math.MaxUint32
-func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) {
+func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) {
if mode.FileType() == 0 {
panic("file type is required in FileMode")
}
i.fs = fs
- i.refs = 1
i.mode = uint32(mode)
- i.uid = uint32(creds.EffectiveKUID)
- i.gid = uint32(creds.EffectiveKGID)
+ i.uid = uint32(kuid)
+ i.gid = uint32(kgid)
i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
// Tmpfs creation sets atime, ctime, and mtime to current time.
now := fs.clock.Now().Nanoseconds()
@@ -285,14 +401,16 @@ func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials,
i.ctime = now
i.mtime = now
// i.nlink initialized by caller
- i.watches = vfs.Watches{}
i.impl = impl
+ i.refs.EnableLeakCheck()
}
// incLinksLocked increments i's link count.
//
-// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
-// i.nlink < maxLinks.
+// Preconditions:
+// * filesystem.mu must be locked for writing.
+// * i.nlink != 0.
+// * i.nlink < maxLinks.
func (i *inode) incLinksLocked() {
if i.nlink == 0 {
panic("tmpfs.inode.incLinksLocked() called with no existing links")
@@ -306,46 +424,36 @@ func (i *inode) incLinksLocked() {
// decLinksLocked decrements i's link count. If the link count reaches 0, we
// remove a reference on i as well.
//
-// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
-func (i *inode) decLinksLocked() {
+// Preconditions:
+// * filesystem.mu must be locked for writing.
+// * i.nlink != 0.
+func (i *inode) decLinksLocked(ctx context.Context) {
if i.nlink == 0 {
panic("tmpfs.inode.decLinksLocked() called with no existing links")
}
if atomic.AddUint32(&i.nlink, ^uint32(0)) == 0 {
- i.decRef()
+ i.decRef(ctx)
}
}
func (i *inode) incRef() {
- if atomic.AddInt64(&i.refs, 1) <= 1 {
- panic("tmpfs.inode.incRef() called without holding a reference")
- }
+ i.refs.IncRef()
}
func (i *inode) tryIncRef() bool {
- for {
- refs := atomic.LoadInt64(&i.refs)
- if refs == 0 {
- return false
- }
- if atomic.CompareAndSwapInt64(&i.refs, refs, refs+1) {
- return true
- }
- }
+ return i.refs.TryIncRef()
}
-func (i *inode) decRef() {
- if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
- i.watches.HandleDeletion()
+func (i *inode) decRef(ctx context.Context) {
+ i.refs.DecRef(func() {
+ i.watches.HandleDeletion(ctx)
if regFile, ok := i.impl.(*regularFile); ok {
// Release memory used by regFile to store data. Since regFile is
// no longer usable, we don't need to grab any locks or update any
// metadata.
regFile.data.DropAll(regFile.memFile)
}
- } else if refs < 0 {
- panic("tmpfs.inode.decRef() called without holding a reference")
- }
+ })
}
func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
@@ -400,7 +508,8 @@ func (i *inode) statTo(stat *linux.Statx) {
}
}
-func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx) error {
+func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions) error {
+ stat := &opts.Stat
if stat.Mask == 0 {
return nil
}
@@ -408,7 +517,7 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu
return syserror.EPERM
}
mode := linux.FileMode(atomic.LoadUint32(&i.mode))
- if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
+ if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
return err
}
i.mu.Lock()
@@ -486,44 +595,6 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu
return nil
}
-// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
-func (i *inode) lockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
- switch i.impl.(type) {
- case *regularFile:
- return i.locks.LockBSD(uid, t, block)
- }
- return syserror.EBADF
-}
-
-// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
-func (i *inode) unlockBSD(uid fslock.UniqueID) error {
- switch i.impl.(type) {
- case *regularFile:
- i.locks.UnlockBSD(uid)
- return nil
- }
- return syserror.EBADF
-}
-
-// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
-func (i *inode) lockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error {
- switch i.impl.(type) {
- case *regularFile:
- return i.locks.LockPOSIX(uid, t, rng, block)
- }
- return syserror.EBADF
-}
-
-// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
-func (i *inode) unlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) error {
- switch i.impl.(type) {
- case *regularFile:
- i.locks.UnlockPOSIX(uid, rng)
- return nil
- }
- return syserror.EBADF
-}
-
// allocatedBlocksForSize returns the number of 512B blocks needed to
// accommodate the given size in bytes, as appropriate for struct
// stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
@@ -543,6 +614,8 @@ func (i *inode) direntType() uint8 {
return linux.DT_LNK
case *socketFile:
return linux.DT_SOCK
+ case *namedPipe:
+ return linux.DT_FIFO
case *deviceFile:
switch impl.kind {
case vfs.BlockDevice:
@@ -562,6 +635,9 @@ func (i *inode) isDir() bool {
}
func (i *inode) touchAtime(mnt *vfs.Mount) {
+ if mnt.Flags.NoATime {
+ return
+ }
if err := mnt.CheckBeginWrite(); err != nil {
return
}
@@ -589,69 +665,63 @@ func (i *inode) touchCMtime() {
i.mu.Unlock()
}
-// Preconditions: The caller has called vfs.Mount.CheckBeginWrite() and holds
-// inode.mu.
+// Preconditions:
+// * The caller has called vfs.Mount.CheckBeginWrite().
+// * inode.mu must be locked.
func (i *inode) touchCMtimeLocked() {
now := i.fs.clock.Now().Nanoseconds()
atomic.StoreInt64(&i.mtime, now)
atomic.StoreInt64(&i.ctime, now)
}
-func (i *inode) listxattr(size uint64) ([]string, error) {
- return i.xattrs.Listxattr(size)
+func (i *inode) listXattr(size uint64) ([]string, error) {
+ return i.xattrs.ListXattr(size)
}
-func (i *inode) getxattr(creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
- if err := i.checkPermissions(creds, vfs.MayRead); err != nil {
+func (i *inode) getXattr(creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
+ if err := i.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
return "", err
}
- if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
- return "", syserror.EOPNOTSUPP
- }
- if !i.userXattrSupported() {
- return "", syserror.ENODATA
- }
- return i.xattrs.Getxattr(opts)
+ return i.xattrs.GetXattr(opts)
}
-func (i *inode) setxattr(creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
- if err := i.checkPermissions(creds, vfs.MayWrite); err != nil {
+func (i *inode) setXattr(creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
+ if err := i.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
return err
}
- if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
- return syserror.EOPNOTSUPP
- }
- if !i.userXattrSupported() {
- return syserror.EPERM
- }
- return i.xattrs.Setxattr(opts)
+ return i.xattrs.SetXattr(opts)
}
-func (i *inode) removexattr(creds *auth.Credentials, name string) error {
- if err := i.checkPermissions(creds, vfs.MayWrite); err != nil {
+func (i *inode) removeXattr(creds *auth.Credentials, name string) error {
+ if err := i.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
return err
}
- if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+ return i.xattrs.RemoveXattr(name)
+}
+
+func (i *inode) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
+ // We currently only support extended attributes in the user.* and
+ // trusted.* namespaces. See b/148380782.
+ if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) && !strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) {
return syserror.EOPNOTSUPP
}
- if !i.userXattrSupported() {
- return syserror.EPERM
+ mode := linux.FileMode(atomic.LoadUint32(&i.mode))
+ kuid := auth.KUID(atomic.LoadUint32(&i.uid))
+ kgid := auth.KGID(atomic.LoadUint32(&i.gid))
+ if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
+ return err
}
- return i.xattrs.Removexattr(name)
-}
-
-// Extended attributes in the user.* namespace are only supported for regular
-// files and directories.
-func (i *inode) userXattrSupported() bool {
- filetype := linux.S_IFMT & atomic.LoadUint32(&i.mode)
- return filetype == linux.S_IFREG || filetype == linux.S_IFDIR
+ return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
}
// fileDescription is embedded by tmpfs implementations of
// vfs.FileDescriptionImpl.
+//
+// +stateify savable
type fileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
+ vfs.LockFD
}
func (fd *fileDescription) filesystem() *filesystem {
@@ -677,77 +747,67 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu
func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
creds := auth.CredentialsFromContext(ctx)
d := fd.dentry()
- if err := d.inode.setStat(ctx, creds, &opts.Stat); err != nil {
+ if err := d.inode.setStat(ctx, creds, &opts); err != nil {
return err
}
if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
- d.InotifyWithParent(ev, 0, vfs.InodeEvent)
+ d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
}
return nil
}
-// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
-func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
- return fd.inode().listxattr(size)
+// StatFS implements vfs.FileDescriptionImpl.StatFS.
+func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
+ return globalStatfs, nil
+}
+
+// ListXattr implements vfs.FileDescriptionImpl.ListXattr.
+func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
+ return fd.inode().listXattr(size)
}
-// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
-func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
- return fd.inode().getxattr(auth.CredentialsFromContext(ctx), &opts)
+// GetXattr implements vfs.FileDescriptionImpl.GetXattr.
+func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
+ return fd.inode().getXattr(auth.CredentialsFromContext(ctx), &opts)
}
-// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
-func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
+// SetXattr implements vfs.FileDescriptionImpl.SetXattr.
+func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
d := fd.dentry()
- if err := d.inode.setxattr(auth.CredentialsFromContext(ctx), &opts); err != nil {
+ if err := d.inode.setXattr(auth.CredentialsFromContext(ctx), &opts); err != nil {
return err
}
// Generate inotify events.
- d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
return nil
}
-// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
-func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
+// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
+func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
d := fd.dentry()
- if err := d.inode.removexattr(auth.CredentialsFromContext(ctx), name); err != nil {
+ if err := d.inode.removeXattr(auth.CredentialsFromContext(ctx), name); err != nil {
return err
}
// Generate inotify events.
- d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
return nil
}
-// NewMemfd creates a new tmpfs regular file and file description that can back
-// an anonymous fd created by memfd_create.
-func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name string) (*vfs.FileDescription, error) {
- fs, ok := mount.Filesystem().Impl().(*filesystem)
- if !ok {
- panic("NewMemfd() called with non-tmpfs mount")
- }
-
- // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with
- // S_IRWXUGO.
- mode := linux.FileMode(0777)
- inode := fs.newRegularFile(creds, mode)
- rf := inode.impl.(*regularFile)
- if allowSeals {
- rf.seals = 0
- }
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
- d := fs.newDentry(inode)
- defer d.DecRef()
- d.name = name
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
- // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with
- // FMODE_READ | FMODE_WRITE.
- var fd regularFileFD
- flags := uint32(linux.O_RDWR)
- if err := fd.vfsfd.Init(&fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
- return nil, err
- }
- return &fd.vfsfd, nil
+// Sync implements vfs.FileDescriptionImpl.Sync. It does nothing because all
+// filesystem state is in-memory.
+func (*fileDescription) Sync(context.Context) error {
+ return nil
}
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
index a240fb276..fc5323abc 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
@@ -34,21 +34,22 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr
creds := auth.CredentialsFromContext(ctx)
vfsObj := &vfs.VirtualFilesystem{}
- if err := vfsObj.Init(); err != nil {
+ if err := vfsObj.Init(ctx); err != nil {
return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err)
}
vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
if err != nil {
return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err)
}
root := mntns.Root()
+ root.IncRef()
return vfsObj, root, func() {
- root.DecRef()
- mntns.DecRef()
+ root.DecRef(ctx)
+ mntns.DecRef(ctx)
}, nil
}
diff --git a/pkg/sentry/fsimpl/verity/BUILD b/pkg/sentry/fsimpl/verity/BUILD
new file mode 100644
index 000000000..e265be0ee
--- /dev/null
+++ b/pkg/sentry/fsimpl/verity/BUILD
@@ -0,0 +1,51 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+licenses(["notice"])
+
+go_library(
+ name = "verity",
+ srcs = [
+ "filesystem.go",
+ "save_restore.go",
+ "verity.go",
+ ],
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/context",
+ "//pkg/fspath",
+ "//pkg/marshal/primitive",
+ "//pkg/merkletree",
+ "//pkg/refsvfs2",
+ "//pkg/sentry/arch",
+ "//pkg/sentry/fs/lock",
+ "//pkg/sentry/kernel",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/socket/unix/transport",
+ "//pkg/sentry/vfs",
+ "//pkg/sync",
+ "//pkg/syserror",
+ "//pkg/usermem",
+ ],
+)
+
+go_test(
+ name = "verity_test",
+ srcs = [
+ "verity_test.go",
+ ],
+ library = ":verity",
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/context",
+ "//pkg/fspath",
+ "//pkg/sentry/arch",
+ "//pkg/sentry/fsimpl/testutil",
+ "//pkg/sentry/fsimpl/tmpfs",
+ "//pkg/sentry/kernel",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/vfs",
+ "//pkg/syserror",
+ "//pkg/usermem",
+ ],
+)
diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go
new file mode 100644
index 000000000..2f6050cfd
--- /dev/null
+++ b/pkg/sentry/fsimpl/verity/filesystem.go
@@ -0,0 +1,1046 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package verity
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+ "strconv"
+ "strings"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/merkletree"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *filesystem) Sync(ctx context.Context) error {
+ // All files should be read-only.
+ return nil
+}
+
+var dentrySlicePool = sync.Pool{
+ New: func() interface{} {
+ ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity
+ return &ds
+ },
+}
+
+func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry {
+ if ds == nil {
+ ds = dentrySlicePool.Get().(*[]*dentry)
+ }
+ *ds = append(*ds, d)
+ return ds
+}
+
+// Preconditions: ds != nil.
+func putDentrySlice(ds *[]*dentry) {
+ // Allow dentries to be GC'd.
+ for i := range *ds {
+ (*ds)[i] = nil
+ }
+ *ds = (*ds)[:0]
+ dentrySlicePool.Put(ds)
+}
+
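dentrySlicePool stores *[]*dentry rather than []*dentry: a pointer is a single word, so Put does not copy the slice header into a freshly allocated interface value, and callers can grow the slice in place. Zeroing elements before returning the slice, as putDentrySlice does, keeps pooled storage from pinning otherwise-dead dentries. A generic sketch of the same pattern:

package main

import (
	"fmt"
	"sync"
)

var pool = sync.Pool{
	New: func() interface{} {
		s := make([]int, 0, 4) // arbitrary non-zero initial capacity
		return &s
	},
}

func main() {
	sp := pool.Get().(*[]int)
	*sp = append(*sp, 42)
	fmt.Println(*sp) // [42]

	// Zero and truncate before Put so pooled elements don't pin garbage.
	for i := range *sp {
		(*sp)[i] = 0
	}
	*sp = (*sp)[:0]
	pool.Put(sp)
}
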
+// renameMuRUnlockAndCheckDrop calls fs.renameMu.RUnlock(), then calls
+// dentry.checkDropLocked on all dentries in *ds with fs.renameMu locked for
+// writing.
+//
+// ds is a pointer-to-pointer since defer evaluates its arguments immediately,
+// but dentry slices are allocated lazily, and it's much easier to say "defer
+// fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() {
+// fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this.
+func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) {
+ fs.renameMu.RUnlock()
+ if *ds == nil {
+ return
+ }
+ if len(**ds) != 0 {
+ fs.renameMu.Lock()
+ for _, d := range **ds {
+ d.checkDropLocked(ctx)
+ }
+ fs.renameMu.Unlock()
+ }
+ putDentrySlice(*ds)
+}
+
+func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) {
+ if *ds == nil {
+ fs.renameMu.Unlock()
+ return
+ }
+ for _, d := range **ds {
+ d.checkDropLocked(ctx)
+ }
+ fs.renameMu.Unlock()
+ putDentrySlice(*ds)
+}
+
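The pointer-to-pointer plumbing in both helpers works because defer evaluates its arguments at the defer statement, not when the deferred call runs: passing the (initially nil) slice by value would freeze it at nil, while passing a pointer observes the slice that appendDentry later allocates. A self-contained sketch of the difference:

package main

import "fmt"

func main() {
	var ds []int
	// Deferred calls run last-in, first-out after ds is populated.
	defer func(p *[]int) { fmt.Println("via pointer:", *p) }(&ds) // [1]
	defer func(v []int) { fmt.Println("by value:", v) }(ds)      // []
	ds = append(ds, 1)
}
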
+// stepLocked resolves rp.Component() to an existing file, starting from the
+// given directory.
+//
+// Dentries which may have a reference count of zero, and which therefore
+// should be dropped once traversal is complete, are appended to ds.
+//
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// !rp.Done().
+func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
+ if !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+
+ if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, err
+ }
+
+afterSymlink:
+ name := rp.Component()
+ if name == "." {
+ rp.Advance()
+ return d, nil
+ }
+ if name == ".." {
+ if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
+ return nil, err
+ } else if isRoot || d.parent == nil {
+ rp.Advance()
+ return d, nil
+ }
+ if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
+ return nil, err
+ }
+ rp.Advance()
+ return d.parent, nil
+ }
+ child, err := fs.getChildLocked(ctx, d, name, ds)
+ if err != nil {
+ return nil, err
+ }
+ if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
+ return nil, err
+ }
+ if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() {
+ target, err := child.readlink(ctx)
+ if err != nil {
+ return nil, err
+ }
+ if err := rp.HandleSymlink(target); err != nil {
+ return nil, err
+ }
+ goto afterSymlink // don't check the current directory again
+ }
+ rp.Advance()
+ return child, nil
+}
+
+// verifyChild verifies the hash of child against the already verified hash of
+// the parent to ensure the child is expected. verifyChild triggers a sentry
+// panic if unexpected modifications to the file system are detected. In
+// noCrashOnVerificationFailure mode it returns a syserror instead.
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// TODO(b/166474175): Investigate all possible errors returned in this
+// function, and make sure we differentiate all errors that indicate unexpected
+// modifications to the file system from the ones that are not harmful.
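+//
+// Verification proceeds in three steps, mirroring the code below:
+// 1. Read the offset of the child's hash in the parent's Merkle tree from
+// the child Merkle file's xattrs.
+// 2. Read the size of the parent's hashed data from the parent Merkle
+// file's xattrs.
+// 3. Run merkletree.Verify over the parent's Merkle tree with the parent's
+// verified hash as Expected, extracting the child's hash.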
+func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *dentry) (*dentry, error) {
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+
+ // Get the path to the child dentry. This is only used to provide path
+ // information in failure cases.
+ childPath, err := vfsObj.PathnameWithDeleted(ctx, child.fs.rootDentry.lowerVD, child.lowerVD)
+ if err != nil {
+ return nil, err
+ }
+
+ fs.verityMu.RLock()
+ defer fs.verityMu.RUnlock()
+ // Read the offset of the child from the extended attributes of the
+ // corresponding Merkle tree file.
+ // This is the offset of the hash for child in its parent's Merkle tree
+ // file.
+ off, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: child.lowerMerkleVD,
+ Start: child.lowerMerkleVD,
+ }, &vfs.GetXattrOptions{
+ Name: merkleOffsetInParentXattr,
+ Size: sizeOfStringInt32,
+ })
+
+ // The Merkle tree file for the child should have been created and
+ // should contain the expected xattrs. If the file or the xattr does not
+ // exist, it indicates unexpected modifications to the file system.
+ if err == syserror.ENOENT || err == syserror.ENODATA {
+ return nil, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleOffsetInParentXattr, childPath, err))
+ }
+ if err != nil {
+ return nil, err
+ }
+ // The offset xattr should be an integer. If it's not, it indicates
+ // unexpected modifications to the file system.
+ offset, err := strconv.Atoi(off)
+ if err != nil {
+ return nil, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleOffsetInParentXattr, childPath, err))
+ }
+
+ // Open parent Merkle tree file to read and verify child's hash.
+ parentMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: parent.lowerMerkleVD,
+ Start: parent.lowerMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ })
+
+ // The parent Merkle tree file should have been created. If it's
+ // missing, it indicates an unexpected modification to the file system.
+ if err == syserror.ENOENT {
+ return nil, alertIntegrityViolation(fmt.Sprintf("Failed to open parent Merkle file for %s: %v", childPath, err))
+ }
+ if err != nil {
+ return nil, err
+ }
+
+ // dataSize is the size of raw data for the Merkle tree. For a file,
+ // dataSize is the size of the whole file. For a directory, dataSize is
+ // the size of all its children's hashes.
+ dataSize, err := parentMerkleFD.GetXattr(ctx, &vfs.GetXattrOptions{
+ Name: merkleSizeXattr,
+ Size: sizeOfStringInt32,
+ })
+
+ // The Merkle tree file for the child should have been created and
+ // should contain the expected xattrs. If the file or the xattr does not
+ // exist, it indicates unexpected modifications to the file system.
+ if err == syserror.ENOENT || err == syserror.ENODATA {
+ return nil, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleSizeXattr, childPath, err))
+ }
+ if err != nil {
+ return nil, err
+ }
+
+ // The dataSize xattr should be an integer. If it's not, it indicates
+ // unexpected modifications to the file system.
+ parentSize, err := strconv.Atoi(dataSize)
+ if err != nil {
+ return nil, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleSizeXattr, childPath, err))
+ }
+
+ fdReader := vfs.FileReadWriteSeeker{
+ FD: parentMerkleFD,
+ Ctx: ctx,
+ }
+
+ parentStat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: parent.lowerVD,
+ Start: parent.lowerVD,
+ }, &vfs.StatOptions{})
+ if err == syserror.ENOENT {
+ return nil, alertIntegrityViolation(fmt.Sprintf("Failed to get parent stat for %s: %v", childPath, err))
+ }
+ if err != nil {
+ return nil, err
+ }
+
+ // Since we are verifying against a directory Merkle tree, buf should
+ // contain the hash of the child in the parent Merkle tree when
+ // Verify returns with success.
+ var buf bytes.Buffer
+ if _, err := merkletree.Verify(&merkletree.VerifyParams{
+ Out: &buf,
+ File: &fdReader,
+ Tree: &fdReader,
+ Size: int64(parentSize),
+ Name: parent.name,
+ Mode: uint32(parentStat.Mode),
+ UID: parentStat.UID,
+ GID: parentStat.GID,
+ //TODO(b/156980949): Support passing other hash algorithms.
+ HashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256,
+ ReadOffset: int64(offset),
+ ReadSize: int64(merkletree.DigestSize(linux.FS_VERITY_HASH_ALG_SHA256)),
+ Expected: parent.hash,
+ DataAndTreeInSameFile: true,
+ }); err != nil && err != io.EOF {
+ return nil, alertIntegrityViolation(fmt.Sprintf("Verification for %s failed: %v", childPath, err))
+ }
+
+ // Cache the child's hash when it is verified for the first time.
+ if len(child.hash) == 0 {
+ child.hash = buf.Bytes()
+ }
+ return child, nil
+}
+
+// verifyStat verifies the stat against the verified hash. The mode/uid/gid
+// of the file are cached once they are verified.
+func (fs *filesystem) verifyStat(ctx context.Context, d *dentry, stat linux.Statx) error {
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+
+ // Get the path to the child dentry. This is only used to provide path
+ // information in failure cases.
+ childPath, err := vfsObj.PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.lowerVD)
+ if err != nil {
+ return err
+ }
+
+ fs.verityMu.RLock()
+ defer fs.verityMu.RUnlock()
+
+ fd, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: d.lowerMerkleVD,
+ Start: d.lowerMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ })
+ if err == syserror.ENOENT {
+ return alertIntegrityViolation(fmt.Sprintf("Failed to open merkle file for %s: %v", childPath, err))
+ }
+ if err != nil {
+ return err
+ }
+
+ merkleSize, err := fd.GetXattr(ctx, &vfs.GetXattrOptions{
+ Name: merkleSizeXattr,
+ Size: sizeOfStringInt32,
+ })
+
+ if err == syserror.ENODATA {
+ return alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for merkle file of %s: %v", merkleSizeXattr, childPath, err))
+ }
+ if err != nil {
+ return err
+ }
+
+ size, err := strconv.Atoi(merkleSize)
+ if err != nil {
+ return alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleSizeXattr, childPath, err))
+ }
+
+ fdReader := vfs.FileReadWriteSeeker{
+ FD: fd,
+ Ctx: ctx,
+ }
+
+ var buf bytes.Buffer
+ params := &merkletree.VerifyParams{
+ Out: &buf,
+ Tree: &fdReader,
+ Size: int64(size),
+ Name: d.name,
+ Mode: uint32(stat.Mode),
+ UID: stat.UID,
+ GID: stat.GID,
+ //TODO(b/156980949): Support passing other hash algorithms.
+ HashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256,
+ ReadOffset: 0,
+ // Set read size to 0 so only the metadata is verified.
+ ReadSize: 0,
+ Expected: d.hash,
+ DataAndTreeInSameFile: false,
+ }
+ if atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR {
+ params.DataAndTreeInSameFile = true
+ }
+
+ if _, err := merkletree.Verify(params); err != nil && err != io.EOF {
+ return alertIntegrityViolation(fmt.Sprintf("Verification stat for %s failed: %v", childPath, err))
+ }
+ d.mode = uint32(stat.Mode)
+ d.uid = stat.UID
+ d.gid = stat.GID
+ d.size = uint32(size)
+ return nil
+}
+
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
+ if child, ok := parent.children[name]; ok {
+ // If verity is enabled on the child, check again whether the
+ // file and the corresponding Merkle tree are as expected, in
+ // order to catch deletion/renaming after the last time it was
+ // accessed.
+ if child.verityEnabled() {
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+ // Get the path to the child dentry. This is only used
+ // to provide path information in failure cases.
+ path, err := vfsObj.PathnameWithDeleted(ctx, child.fs.rootDentry.lowerVD, child.lowerVD)
+ if err != nil {
+ return nil, err
+ }
+
+ childVD, err := parent.getLowerAt(ctx, vfsObj, name)
+ if err == syserror.ENOENT {
+ // The file was previously accessed. If the
+ // file does not exist now, it indicates an
+ // unexpected modification to the file system.
+ return nil, alertIntegrityViolation(fmt.Sprintf("Target file %s is expected but missing", path))
+ }
+ if err != nil {
+ return nil, err
+ }
+ defer childVD.DecRef(ctx)
+
+ childMerkleVD, err := parent.getLowerAt(ctx, vfsObj, merklePrefix+name)
+ // The Merkle tree file was previously accessed. If it
+ // does not exist now, it indicates an unexpected
+ // modification to the file system.
+ if err == syserror.ENOENT {
+ return nil, alertIntegrityViolation(fmt.Sprintf("Expected Merkle file for target %s but none found", path))
+ }
+ if err != nil {
+ return nil, err
+ }
+
+ defer childMerkleVD.DecRef(ctx)
+ }
+
+ // If enabling verification on files/directories is not allowed
+ // during runtime, all cached children are already verified. If
+ // runtime enable is allowed and the parent directory is
+ // enabled, verify the child hash here because it may have been
+ // cached before verity was enabled on it.
+ if fs.allowRuntimeEnable {
+ if parent.verityEnabled() {
+ if _, err := fs.verifyChild(ctx, parent, child); err != nil {
+ return nil, err
+ }
+ }
+ if child.verityEnabled() {
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+ mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID)
+ stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: child.lowerVD,
+ Start: child.lowerVD,
+ }, &vfs.StatOptions{
+ Mask: mask,
+ })
+ if err != nil {
+ return nil, err
+ }
+ if err := fs.verifyStat(ctx, child, stat); err != nil {
+ return nil, err
+ }
+ }
+ }
+ return child, nil
+ }
+ child, err := fs.lookupAndVerifyLocked(ctx, parent, name)
+ if err != nil {
+ return nil, err
+ }
+ if parent.children == nil {
+ parent.children = make(map[string]*dentry)
+ }
+ parent.children[name] = child
+ // child's refcount is initially 0, so it may be dropped after traversal.
+ *ds = appendDentry(*ds, child)
+ return child, nil
+}
+
+// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) {
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+
+ childVD, childErr := parent.getLowerAt(ctx, vfsObj, name)
+ // We will handle ENOENT separately, as it may indicate unexpected
+ // modifications to the file system, and may cause a sentry panic.
+ if childErr != nil && childErr != syserror.ENOENT {
+ return nil, childErr
+ }
+
+ // The dentry needs to be cleaned up if any error occurs. IncRef will be
+ // called if a verity child dentry is successfully created.
+ if childErr == nil {
+ defer childVD.DecRef(ctx)
+ }
+
+ childMerkleVD, childMerkleErr := parent.getLowerAt(ctx, vfsObj, merklePrefix+name)
+ // We will handle ENOENT separately, as it may indicate unexpected
+ // modifications to the file system, and may cause a sentry panic.
+ if childMerkleErr != nil && childMerkleErr != syserror.ENOENT {
+ return nil, childMerkleErr
+ }
+
+ // The dentry needs to be cleaned up if any error occurs. IncRef will be
+ // called if a verity child dentry is successfully created.
+ if childMerkleErr == nil {
+ defer childMerkleVD.DecRef(ctx)
+ }
+
+ // Get the path to the parent dentry. This is only used to provide path
+ // information in failure cases.
+ parentPath, err := vfsObj.PathnameWithDeleted(ctx, parent.fs.rootDentry.lowerVD, parent.lowerVD)
+ if err != nil {
+ return nil, err
+ }
+
+ // TODO(b/166474175): Investigate all possible errors of childErr and
+ // childMerkleErr, and make sure we differentiate all errors that
+ // indicate unexpected modifications to the file system from the ones
+ // that are not harmful.
+ if childErr == syserror.ENOENT && childMerkleErr == nil {
+ // Failed to get the child file/directory dentry. However, the
+ // corresponding Merkle tree was found. This indicates an
+ // unexpected modification to the file system that
+ // removed/renamed the child.
+ return nil, alertIntegrityViolation(fmt.Sprintf("Target file %s is expected but missing", parentPath+"/"+name))
+ } else if childErr == nil && childMerkleErr == syserror.ENOENT {
+ // If in allowRuntimeEnable mode, and the Merkle tree file is
+ // not created yet, we create an empty Merkle tree file, so that
+ // if the file is enabled through ioctl, we have the Merkle tree
+ // file open and ready to use.
+ // This may cause empty and unused Merkle tree files in
+ // allowRuntimeEnable mode, if they are never enabled. This
+ // does not affect verification, as we rely on cached hash to
+ // decide whether to perform verification, not the existence of
+ // the Merkle tree file. Also, those Merkle tree files are
+ // always hidden and cannot be accessed by verity fs users.
+ if fs.allowRuntimeEnable {
+ childMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: parent.lowerVD,
+ Start: parent.lowerVD,
+ Path: fspath.Parse(merklePrefix + name),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR | linux.O_CREAT,
+ Mode: 0644,
+ })
+ if err != nil {
+ return nil, err
+ }
+ childMerkleFD.DecRef(ctx)
+ childMerkleVD, err = parent.getLowerAt(ctx, vfsObj, merklePrefix+name)
+ if err != nil {
+ return nil, err
+ }
+ // Drop the reference obtained by getLowerAt above once this
+ // function returns; the child dentry takes its own reference
+ // via IncRef below.
+ defer childMerkleVD.DecRef(ctx)
+ } else {
+ // Runtime enable is not allowed, so this indicates an
+ // unexpected modification to the file system that
+ // removed/renamed the Merkle tree file.
+ return nil, alertIntegrityViolation(fmt.Sprintf("Expected Merkle file for target %s but none found", parentPath+"/"+name))
+ }
+ } else if childErr == syserror.ENOENT && childMerkleErr == syserror.ENOENT {
+ // Both the child and the corresponding Merkle tree are missing.
+ // This could be an unexpected modification or due to an
+ // incorrect parameter.
+ // TODO(b/167752508): Investigate possible ways to differentiate
+ // cases that both files are deleted from cases that they never
+ // exist in the file system.
+ return nil, alertIntegrityViolation(fmt.Sprintf("Failed to find file %s", parentPath+"/"+name))
+ }
+
+ mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID)
+ stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: childVD,
+ Start: childVD,
+ }, &vfs.StatOptions{
+ Mask: mask,
+ })
+ if err != nil {
+ return nil, err
+ }
+
+ child := fs.newDentry()
+ child.lowerVD = childVD
+ child.lowerMerkleVD = childMerkleVD
+
+ // Increase the reference for both childVD and childMerkleVD as they are
+ // held by child. If this function fails and the child is destroyed, the
+ // references will be decreased in destroyLocked.
+ childVD.IncRef()
+ childMerkleVD.IncRef()
+
+ parent.IncRef()
+ child.parent = parent
+ child.name = name
+
+ child.mode = uint32(stat.Mode)
+ child.uid = stat.UID
+ child.gid = stat.GID
+
+ // Verify child hash. This should always be performed unless in
+ // allowRuntimeEnable mode and the parent directory hasn't been enabled
+ // yet.
+ if parent.verityEnabled() {
+ if _, err := fs.verifyChild(ctx, parent, child); err != nil {
+ child.destroyLocked(ctx)
+ return nil, err
+ }
+ }
+ if child.verityEnabled() {
+ if err := fs.verifyStat(ctx, child, stat); err != nil {
+ child.destroyLocked(ctx)
+ return nil, err
+ }
+ }
+
+ return child, nil
+}
+
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory, starting from the given directory (which is usually
+// rp.Start().Impl().(*dentry)). It does not check that the returned directory
+// is searchable by the provider of rp.
+//
+// Preconditions: fs.renameMu must be locked. !rp.Done().
+func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
+ for !rp.Final() {
+ d.dirMu.Lock()
+ next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+ d.dirMu.Unlock()
+ if err != nil {
+ return nil, err
+ }
+ d = next
+ }
+ if !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ return d, nil
+}
+
+// resolveLocked resolves rp to an existing file.
+//
+// Preconditions: fs.renameMu must be locked.
+func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
+ d := rp.Start().Impl().(*dentry)
+ for !rp.Done() {
+ d.dirMu.Lock()
+ next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+ d.dirMu.Unlock()
+ if err != nil {
+ return nil, err
+ }
+ d = next
+ }
+ if rp.MustBeDir() && !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ return d, nil
+}
+
+// AccessAt implements vfs.FilesystemImpl.AccessAt.
+func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
+ // Verity file system is read-only.
+ if ats&vfs.MayWrite != 0 {
+ return syserror.EROFS
+ }
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return err
+ }
+ return d.checkPermissions(creds, ats)
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return nil, err
+ }
+ if opts.CheckSearchable {
+ if !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, err
+ }
+ }
+ d.IncRef()
+ return &d.vfsd, nil
+}
+
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ start := rp.Start().Impl().(*dentry)
+ d, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+ if err != nil {
+ return nil, err
+ }
+ d.IncRef()
+ return &d.vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+ // Verity file system is read-only.
+ return syserror.EROFS
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+ // Verity file system is read-only.
+ return syserror.EROFS
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+ // Verity file system is read-only.
+ return syserror.EROFS
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ // Verity fs is read-only.
+ if opts.Flags&(linux.O_WRONLY|linux.O_CREAT) != 0 {
+ return nil, syserror.EROFS
+ }
+
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+
+ start := rp.Start().Impl().(*dentry)
+ if rp.Done() {
+ return start.openLocked(ctx, rp, &opts)
+ }
+
+afterTrailingSymlink:
+ parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+ if err != nil {
+ return nil, err
+ }
+
+ // Check for search permission in the parent directory.
+ if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, err
+ }
+
+ // Open existing child or follow symlink.
+ parent.dirMu.Lock()
+ child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
+ parent.dirMu.Unlock()
+ if err != nil {
+ return nil, err
+ }
+ if child.isSymlink() && rp.ShouldFollowSymlink() {
+ target, err := child.readlink(ctx)
+ if err != nil {
+ return nil, err
+ }
+ if err := rp.HandleSymlink(target); err != nil {
+ return nil, err
+ }
+ start = parent
+ goto afterTrailingSymlink
+ }
+ return child.openLocked(ctx, rp, &opts)
+}
+
+// Preconditions: fs.renameMu must be locked.
+func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+ // Users should not open the Merkle tree files. Those are for verity fs
+ // use only.
+ if strings.Contains(d.name, merklePrefix) {
+ return nil, syserror.EPERM
+ }
+ ats := vfs.AccessTypesForOpenFlags(opts)
+ if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
+ return nil, err
+ }
+
+ // Verity fs is read-only.
+ if ats&vfs.MayWrite != 0 {
+ return nil, syserror.EROFS
+ }
+
+ // Get the path to the target file. This is only used to provide path
+ // information in failure cases.
+ path, err := d.fs.vfsfs.VirtualFilesystem().PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.lowerVD)
+ if err != nil {
+ return nil, err
+ }
+
+ // Open the file in the underlying file system.
+ lowerFD, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.lowerVD,
+ Start: d.lowerVD,
+ }, opts)
+
+ // The file should exist, as we succeeded in finding its dentry. If it's
+ // missing, it indicates an unexpected modification to the file system.
+ if err != nil {
+ if err == syserror.ENOENT {
+ return nil, alertIntegrityViolation(fmt.Sprintf("File %s expected but not found", path))
+ }
+ return nil, err
+ }
+
+ // lowerFD needs to be cleaned up if any error occurs. IncRef will be
+ // called if a verity FD is successfully created.
+ defer lowerFD.DecRef(ctx)
+
+ // Open the Merkle tree file corresponding to the current file/directory
+ // to be used later for verifying Read/Walk.
+ merkleReader, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.lowerMerkleVD,
+ Start: d.lowerMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ })
+
+ // The Merkle tree file should exist, as we succeeded in finding its
+ // dentry. If it's missing, it indicates an unexpected modification to
+ // the file system.
+ if err != nil {
+ if err == syserror.ENOENT {
+ return nil, alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", path))
+ }
+ return nil, err
+ }
+
+ // merkleReader needs to be cleaned up if any error occurs. IncRef will
+ // be called if a verity FD is successfully created.
+ defer merkleReader.DecRef(ctx)
+
+ lowerFlags := lowerFD.StatusFlags()
+ lowerFDOpts := lowerFD.Options()
+ var merkleWriter *vfs.FileDescription
+ var parentMerkleWriter *vfs.FileDescription
+
+ // Only open the Merkle tree files for write if in allowRuntimeEnable
+ // mode.
+ if d.fs.allowRuntimeEnable {
+ merkleWriter, err = rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.lowerMerkleVD,
+ Start: d.lowerMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_WRONLY | linux.O_APPEND,
+ })
+ if err != nil {
+ if err == syserror.ENOENT {
+ return nil, alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", path))
+ }
+ return nil, err
+ }
+ // merkleWriter is cleaned up if any error occurs. IncRef will
+ // be called if a verity FD is created successfully.
+ defer merkleWriter.DecRef(ctx)
+
+ if d.parent != nil {
+ parentMerkleWriter, err = rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.parent.lowerMerkleVD,
+ Start: d.parent.lowerMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_WRONLY | linux.O_APPEND,
+ })
+ if err != nil {
+ if err == syserror.ENOENT {
+ parentPath, _ := d.fs.vfsfs.VirtualFilesystem().PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.parent.lowerVD)
+ return nil, alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", parentPath))
+ }
+ return nil, err
+ }
+ // parentMerkleWriter is cleaned up if any error occurs. IncRef
+ // will be called if a verity FD is created successfully.
+ defer parentMerkleWriter.DecRef(ctx)
+ }
+ }
+
+ fd := &fileDescription{
+ d: d,
+ lowerFD: lowerFD,
+ merkleReader: merkleReader,
+ merkleWriter: merkleWriter,
+ parentMerkleWriter: parentMerkleWriter,
+ isDir: d.isDir(),
+ }
+
+ if err := fd.vfsfd.Init(fd, lowerFlags, rp.Mount(), &d.vfsd, &lowerFDOpts); err != nil {
+ return nil, err
+ }
+ lowerFD.IncRef()
+ merkleReader.IncRef()
+ if merkleWriter != nil {
+ merkleWriter.IncRef()
+ }
+ if parentMerkleWriter != nil {
+ parentMerkleWriter.IncRef()
+ }
+ return &fd.vfsfd, nil
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return "", err
+ }
+ //TODO(b/162787271): Provide integrity check for ReadlinkAt.
+ return fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.lowerVD,
+ Start: d.lowerVD,
+ })
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+ // Verity file system is read-only.
+ return syserror.EROFS
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+ // Verity file system is read-only.
+ return syserror.EROFS
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+ // Verity file system is read-only.
+ return syserror.EROFS
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+// TODO(b/170157489): Investigate whether stats other than Mode/UID/GID should
+// be verified.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return linux.Statx{}, err
+ }
+
+ stat, err := fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: d.lowerVD,
+ Start: d.lowerVD,
+ }, &opts)
+ if err != nil {
+ return linux.Statx{}, err
+ }
+ if d.verityEnabled() {
+ if err := fs.verifyStat(ctx, d, stat); err != nil {
+ return linux.Statx{}, err
+ }
+ }
+ return stat, nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+ // TODO(b/159261227): Implement StatFSAt.
+ return linux.Statfs{}, nil
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+ // Verity file system is read-only.
+ return syserror.EROFS
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+ // Verity file system is read-only.
+ return syserror.EROFS
+}
+
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
+func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ if _, err := fs.resolveLocked(ctx, rp, &ds); err != nil {
+ return nil, err
+ }
+ return nil, syserror.ECONNREFUSED
+}
+
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return nil, err
+ }
+ lowerVD := d.lowerVD
+ return fs.vfsfs.VirtualFilesystem().ListXattrAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: lowerVD,
+ Start: lowerVD,
+ }, size)
+}
+
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return "", err
+ }
+ lowerVD := d.lowerVD
+ return fs.vfsfs.VirtualFilesystem().GetXattrAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: lowerVD,
+ Start: lowerVD,
+ }, &opts)
+}
+
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
+ // Verity file system is read-only.
+ return syserror.EROFS
+}
+
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+ // Verity file system is read-only.
+ return syserror.EROFS
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+ fs.renameMu.RLock()
+ defer fs.renameMu.RUnlock()
+ mnt := vd.Mount()
+ d := vd.Dentry().Impl().(*dentry)
+ for {
+ if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() {
+ return vfs.PrependPathAtVFSRootError{}
+ }
+ if &d.vfsd == mnt.Root() {
+ return nil
+ }
+ if d.parent == nil {
+ return vfs.PrependPathAtNonMountRootError{}
+ }
+ b.PrependComponent(d.name)
+ d = d.parent
+ }
+}
diff --git a/pkg/sentry/fsimpl/verity/save_restore.go b/pkg/sentry/fsimpl/verity/save_restore.go
new file mode 100644
index 000000000..4a161163c
--- /dev/null
+++ b/pkg/sentry/fsimpl/verity/save_restore.go
@@ -0,0 +1,27 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package verity
+
+import (
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/refsvfs2"
+)
+
+func (d *dentry) afterLoad() {
+ if refsvfs2.LeakCheckEnabled() && atomic.LoadInt64(&d.refs) != -1 {
+ refsvfs2.Register(d, "verity.dentry")
+ }
+}
diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go
new file mode 100644
index 000000000..92ca6ca6b
--- /dev/null
+++ b/pkg/sentry/fsimpl/verity/verity.go
@@ -0,0 +1,880 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package verity provides a filesystem implementation that is a wrapper of
+// another file system.
+// The verity file system provides integrity checks for the underlying file
+// system by verifying path traversals and each read.
+// The verity file system is read-only, except for one case: when
+// allowRuntimeEnable is true, additional Merkle files can be generated using
+// the FS_IOC_ENABLE_VERITY ioctl.
+package verity
+
+import (
+ "fmt"
+ "math"
+ "strconv"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
+ "gvisor.dev/gvisor/pkg/merkletree"
+ "gvisor.dev/gvisor/pkg/refsvfs2"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Name is the default filesystem name.
+const Name = "verity"
+
+// merklePrefix is the prefix of the Merkle tree files. For example, the Merkle
+// tree file for "/foo" is "/.merkle.verity.foo".
+const merklePrefix = ".merkle.verity."
+
+// merkleOffsetInParentXattr is the extended attribute name specifying the
+// offset of the child's hash in its parent's Merkle tree.
+const merkleOffsetInParentXattr = "user.merkle.offset"
+
+// merkleSizeXattr is the extended attribute name specifying the size of data
+// hashed by the corresponding Merkle tree. For a file, it's the size of the
+// whole file. For a directory, it's the size of all its children's hashes.
+const merkleSizeXattr = "user.merkle.size"
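+
+// For illustration: after FS_IOC_ENABLE_VERITY on a 4096-byte file "/foo",
+// its Merkle file "/.merkle.verity.foo" carries user.merkle.size = "4096",
+// and (for a non-root file) user.merkle.offset records the byte offset of
+// foo's hash within its parent's Merkle tree file. The sizes here are
+// hypothetical.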
+
+// sizeOfStringInt32 is the size for a 32-bit integer stored as a string in
+// extended attributes. The maximum value of a 32-bit integer has 10 digits.
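+// For example, strconv.Itoa(math.MaxInt32) is "2147483647", which is 10
+// bytes long.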
+const sizeOfStringInt32 = 10
+
+// noCrashOnVerificationFailure indicates whether the sandbox should avoid
+// panicking when verification fails. If true, an error is returned instead
+// of panicking. This should only be set for tests.
+// TODO(b/165661693): Decide whether to panic or return error based on this
+// flag.
+var noCrashOnVerificationFailure bool
+
+// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
+type FilesystemType struct{}
+
+// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
+type filesystem struct {
+ vfsfs vfs.Filesystem
+
+ // creds is a copy of the filesystem's creator's credentials, which are
+ // used for accesses to the underlying file system. creds is immutable.
+ creds *auth.Credentials
+
+ // allowRuntimeEnable is true if using ioctl with FS_IOC_ENABLE_VERITY
+ // to build Merkle trees in the verity file system is allowed. If this
+ // is false, no new Merkle trees can be built, and only the files that
+ // had Merkle trees before startup (e.g. from a host filesystem mounted
+ // with gofer fs) can be verified.
+ allowRuntimeEnable bool
+
+ // lowerMount is the underlying file system mount.
+ lowerMount *vfs.Mount
+
+ // rootDentry is the mount root Dentry for this file system, which
+ // stores the root hash of the whole file system in bytes.
+ rootDentry *dentry
+
+ // renameMu synchronizes renaming with non-renaming operations in order
+ // to ensure consistent lock ordering between dentry.dirMu in different
+ // dentries.
+ renameMu sync.RWMutex `state:"nosave"`
+
+ // verityMu synchronizes enabling verity files, protecting files and
+ // directories from being enabled by different threads simultaneously.
+ // It also ensures that verity does not access files that are being
+ // enabled.
+ //
+ // Also, a directory's Merkle tree depends on the generated trees of
+ // its children, so they should not be enabled at the same time. This
+ // lock is filesystem-wide to ensure that no more than one file is
+ // enabled at a time.
+ verityMu sync.RWMutex
+}
+
+// InternalFilesystemOptions may be passed as
+// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
+//
+// +stateify savable
+type InternalFilesystemOptions struct {
+ // RootMerkleFileName is the name of the verity root Merkle tree file.
+ RootMerkleFileName string
+
+ // LowerName is the name of the filesystem wrapped by verity fs.
+ LowerName string
+
+ // RootHash is the root hash of the overall verity file system.
+ RootHash []byte
+
+ // AllowRuntimeEnable specifies whether the verity file system allows
+ // enabling verification for files (i.e. building Merkle trees) during
+ // runtime.
+ AllowRuntimeEnable bool
+
+ // LowerGetFSOptions is the file system option for the lower layer file
+ // system wrapped by verity file system.
+ LowerGetFSOptions vfs.GetFilesystemOptions
+
+ // NoCrashOnVerificationFailure indicates whether the sandbox should
+ // panic whenever verification fails. If true, an error is returned
+ // instead of panicking. This should only be set for tests.
+ NoCrashOnVerificationFailure bool
+}
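+
+// For illustration, a sandbox configuring verity over a lower tmpfs mount
+// might pass options like the following (all field values here are
+// hypothetical):
+//
+// iopts := verity.InternalFilesystemOptions{
+// RootMerkleFileName: "root",
+// LowerName: "tmpfs",
+// AllowRuntimeEnable: true,
+// LowerGetFSOptions: vfs.GetFilesystemOptions{},
+// }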
+
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+ return Name
+}
+
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
+// alertIntegrityViolation alerts a violation of integrity, which usually
+// means that an unexpected modification to the file system has been detected.
+// In noCrashOnVerificationFailure mode it returns EIO; otherwise it panics.
+func alertIntegrityViolation(msg string) error {
+ if noCrashOnVerificationFailure {
+ return syserror.EIO
+ }
+ panic(msg)
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+ iopts, ok := opts.InternalData.(InternalFilesystemOptions)
+ if !ok {
+ ctx.Warningf("verity.FilesystemType.GetFilesystem: missing verity configs")
+ return nil, nil, syserror.EINVAL
+ }
+ noCrashOnVerificationFailure = iopts.NoCrashOnVerificationFailure
+
+ // Mount the lower file system. The lower file system is wrapped inside
+ // verity, and should not be exposed or connected.
+ mopts := &vfs.MountOptions{
+ GetFilesystemOptions: iopts.LowerGetFSOptions,
+ InternalMount: true,
+ }
+ mnt, err := vfsObj.MountDisconnected(ctx, creds, "", iopts.LowerName, mopts)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ fs := &filesystem{
+ creds: creds.Fork(),
+ lowerMount: mnt,
+ allowRuntimeEnable: iopts.AllowRuntimeEnable,
+ }
+ fs.vfsfs.Init(vfsObj, &fstype, fs)
+
+ // Construct the root dentry.
+ d := fs.newDentry()
+ d.refs = 1
+ lowerVD := vfs.MakeVirtualDentry(mnt, mnt.Root())
+ lowerVD.IncRef()
+ d.lowerVD = lowerVD
+
+ rootMerkleName := merklePrefix + iopts.RootMerkleFileName
+
+ lowerMerkleVD, err := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: lowerVD,
+ Start: lowerVD,
+ Path: fspath.Parse(rootMerkleName),
+ }, &vfs.GetDentryOptions{})
+
+ // If runtime enable is allowed, the root Merkle tree may be absent. We
+ // should create the tree file.
+ if err == syserror.ENOENT && fs.allowRuntimeEnable {
+ lowerMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: lowerVD,
+ Start: lowerVD,
+ Path: fspath.Parse(rootMerkleName),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR | linux.O_CREAT,
+ Mode: 0644,
+ })
+ if err != nil {
+ fs.vfsfs.DecRef(ctx)
+ d.DecRef(ctx)
+ return nil, nil, err
+ }
+ lowerMerkleFD.DecRef(ctx)
+ lowerMerkleVD, err = vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: lowerVD,
+ Start: lowerVD,
+ Path: fspath.Parse(rootMerkleName),
+ }, &vfs.GetDentryOptions{})
+ if err != nil {
+ fs.vfsfs.DecRef(ctx)
+ d.DecRef(ctx)
+ return nil, nil, err
+ }
+ } else if err != nil {
+ // Failed to get dentry for the root Merkle file. This
+ // indicates an unexpected modification that removed/renamed
+ // the root Merkle file, or that it was never generated.
+ fs.vfsfs.DecRef(ctx)
+ d.DecRef(ctx)
+ return nil, nil, alertIntegrityViolation("Failed to find root Merkle file")
+ }
+ d.lowerMerkleVD = lowerMerkleVD
+
+ // Get metadata from the underlying file system.
+ const statMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID
+ stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+ Root: lowerVD,
+ Start: lowerVD,
+ }, &vfs.StatOptions{
+ Mask: statMask,
+ })
+ if err != nil {
+ fs.vfsfs.DecRef(ctx)
+ d.DecRef(ctx)
+ return nil, nil, err
+ }
+
+ d.mode = uint32(stat.Mode)
+ d.uid = stat.UID
+ d.gid = stat.GID
+ d.hash = make([]byte, len(iopts.RootHash))
+ copy(d.hash, iopts.RootHash)
+
+ // Verify the root metadata against the root hash before the dentry is
+ // exposed. This must happen after d.hash is populated, since verifyStat
+ // uses it as the expected hash.
+ if !fs.allowRuntimeEnable {
+ if err := fs.verifyStat(ctx, d, stat); err != nil {
+ fs.vfsfs.DecRef(ctx)
+ d.DecRef(ctx)
+ return nil, nil, err
+ }
+ }
+
+ d.vfsd.Init(d)
+
+ fs.rootDentry = d
+
+ return &fs.vfsfs, &d.vfsd, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release(ctx context.Context) {
+ fs.lowerMount.DecRef(ctx)
+}
+
+// dentry implements vfs.DentryImpl.
+//
+// +stateify savable
+type dentry struct {
+ vfsd vfs.Dentry
+
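+ // refs is the reference count. Each child dentry holds a reference on
+ // its parent. refs is set to -1 once the dentry has been destroyed
+ // (see destroyLocked).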
+ refs int64
+
+ // fs is the owning filesystem. fs is immutable.
+ fs *filesystem
+
+ // mode, uid, gid and size are the file mode, owner, group, and size of
+ // the file in the underlying file system.
+ mode uint32
+ uid uint32
+ gid uint32
+ size uint32
+
+ // parent is the dentry corresponding to this dentry's parent directory.
+ // name is this dentry's name in parent. If this dentry is a filesystem
+ // root, parent is nil and name is the empty string. parent and name are
+ // protected by fs.renameMu.
+ parent *dentry
+ name string
+
+ // If this dentry represents a directory, children maps the names of
+ // children for which dentries have been instantiated to those dentries.
+ // children is protected by dirMu.
+ dirMu sync.Mutex `state:"nosave"`
+ children map[string]*dentry
+
+ // lowerVD is the VirtualDentry in the underlying file system.
+ lowerVD vfs.VirtualDentry
+
+ // lowerMerkleVD is the VirtualDentry of the corresponding Merkle tree
+ // in the underlying file system.
+ lowerMerkleVD vfs.VirtualDentry
+
+ // hash is the calculated hash for the current file or directory.
+ hash []byte
+}
+
+// newDentry creates a new dentry representing the given verity file. The
+// dentry initially has no references; it is the caller's responsibility to set
+// the dentry's reference count and/or call dentry.destroyLocked() as
+// appropriate. The dentry is initially invalid in that it contains no
+// underlying dentries; the caller is responsible for setting them.
+func (fs *filesystem) newDentry() *dentry {
+ d := &dentry{
+ fs: fs,
+ }
+ d.vfsd.Init(d)
+ if refsvfs2.LeakCheckEnabled() {
+ refsvfs2.Register(d, "verity.dentry")
+ }
+ return d
+}
+
+// IncRef implements vfs.DentryImpl.IncRef.
+func (d *dentry) IncRef() {
+ atomic.AddInt64(&d.refs, 1)
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef.
+func (d *dentry) TryIncRef() bool {
+ for {
+ refs := atomic.LoadInt64(&d.refs)
+ if refs <= 0 {
+ return false
+ }
+ if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) {
+ return true
+ }
+ }
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+func (d *dentry) DecRef(ctx context.Context) {
+ if refs := atomic.AddInt64(&d.refs, -1); refs == 0 {
+ d.fs.renameMu.Lock()
+ d.checkDropLocked(ctx)
+ d.fs.renameMu.Unlock()
+ } else if refs < 0 {
+ panic("verity.dentry.DecRef() called without holding a reference")
+ }
+}
+
+// checkDropLocked should be called after d's reference count becomes 0 or it
+// becomes deleted.
+func (d *dentry) checkDropLocked(ctx context.Context) {
+ // Dentries with a positive reference count must be retained. Dentries
+ // with a negative reference count have already been destroyed.
+ if atomic.LoadInt64(&d.refs) != 0 {
+ return
+ }
+ // refs is still zero; destroy the dentry.
+ d.destroyLocked(ctx)
+}
+
+// destroyLocked destroys the dentry.
+//
+// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0.
+func (d *dentry) destroyLocked(ctx context.Context) {
+ switch atomic.LoadInt64(&d.refs) {
+ case 0:
+ // Mark the dentry destroyed.
+ atomic.StoreInt64(&d.refs, -1)
+ case -1:
+ panic("verity.dentry.destroyLocked() called on already destroyed dentry")
+ default:
+ panic("verity.dentry.destroyLocked() called with references on the dentry")
+ }
+
+ if d.lowerVD.Ok() {
+ d.lowerVD.DecRef(ctx)
+ }
+ if refsvfs2.LeakCheckEnabled() {
+ refsvfs2.Unregister(d, "verity.dentry")
+ }
+
+ if d.lowerMerkleVD.Ok() {
+ d.lowerMerkleVD.DecRef(ctx)
+ }
+
+ if d.parent != nil {
+ d.parent.dirMu.Lock()
+ if !d.vfsd.IsDead() {
+ delete(d.parent.children, d.name)
+ }
+ d.parent.dirMu.Unlock()
+ if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 {
+ d.parent.checkDropLocked(ctx)
+ } else if refs < 0 {
+ panic("verity.dentry.DecRef() called without holding a reference")
+ }
+ }
+}
+
+// LeakMessage implements refsvfs2.CheckedObject.LeakMessage.
+func (d *dentry) LeakMessage() string {
+ return fmt.Sprintf("[verity.dentry %p] reference count of %d instead of -1", d, atomic.LoadInt64(&d.refs))
+}
+
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
+ //TODO(b/159261227): Implement InotifyWithParent.
+}
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *dentry) Watches() *vfs.Watches {
+ //TODO(b/159261227): Implement Watches.
+ return nil
+}
+
+// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
+func (d *dentry) OnZeroWatches(context.Context) {
+ //TODO(b/159261227): Implement OnZeroWatches.
+}
+
+func (d *dentry) isSymlink() bool {
+ return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK
+}
+
+func (d *dentry) isDir() bool {
+ return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR
+}
+
+func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+ return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
+}
+
+// verityEnabled checks whether the file is enabled with verity features. It
+// should always be true if runtime enable is not allowed. In runtime enable
+// mode, it returns true if the target has been enabled with
+// ioctl(FS_IOC_ENABLE_VERITY).
+func (d *dentry) verityEnabled() bool {
+ return !d.fs.allowRuntimeEnable || len(d.hash) != 0
+}
+
+// getLowerAt returns the dentry in the underlying file system for filename,
+// resolved relative to d.
+func (d *dentry) getLowerAt(ctx context.Context, vfsObj *vfs.VirtualFilesystem, filename string) (vfs.VirtualDentry, error) {
+ return vfsObj.GetDentryAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.lowerVD,
+ Start: d.lowerVD,
+ Path: fspath.Parse(filename),
+ }, &vfs.GetDentryOptions{})
+}
+
+func (d *dentry) readlink(ctx context.Context) (string, error) {
+ return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.lowerVD,
+ Start: d.lowerVD,
+ })
+}
+
+// fileDescription implements vfs.FileDescriptionImpl for verity fds.
+// fileDescription wraps the underlying lowerFD, adding support for building
+// Merkle trees through the Linux fs-verity API and for verifying contents
+// read from lowerFD.
+//
+// +stateify savable
+type fileDescription struct {
+ vfsfd vfs.FileDescription
+ vfs.FileDescriptionDefaultImpl
+ vfs.LockFD
+
+ // d is the corresponding dentry to the fileDescription.
+ d *dentry
+
+ // isDir specifies whether the fileDescription points to a directory.
+ isDir bool
+
+ // lowerFD is the FileDescription corresponding to the file in the
+ // underlying file system.
+ lowerFD *vfs.FileDescription
+
+ // merkleReader is the read-only FileDescription corresponding to the
+ // Merkle tree file in the underlying file system.
+ merkleReader *vfs.FileDescription
+
+ // merkleWriter is the FileDescription corresponding to the Merkle tree
+ // file in the underlying file system for writing. This should only be
+ // used when allowRuntimeEnable is set to true.
+ merkleWriter *vfs.FileDescription
+
+ // parentMerkleWriter is the FileDescription of the Merkle tree for the
+ // directory that contains the current file/directory. This is only used
+ // if allowRuntimeEnable is set to true.
+ parentMerkleWriter *vfs.FileDescription
+
+ // off is the file offset. off is protected by mu.
+ mu sync.Mutex `state:"nosave"`
+ off int64
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *fileDescription) Release(ctx context.Context) {
+ fd.lowerFD.DecRef(ctx)
+ fd.merkleReader.DecRef(ctx)
+ if fd.merkleWriter != nil {
+ fd.merkleWriter.DecRef(ctx)
+ }
+ if fd.parentMerkleWriter != nil {
+ fd.parentMerkleWriter.DecRef(ctx)
+ }
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+ // TODO(b/162788573): Add integrity check for metadata.
+ stat, err := fd.lowerFD.Stat(ctx, opts)
+ if err != nil {
+ return linux.Statx{}, err
+ }
+ if fd.d.verityEnabled() {
+ if err := fd.d.fs.verifyStat(ctx, fd.d, stat); err != nil {
+ return linux.Statx{}, err
+ }
+ }
+ return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+ // Verity files are read-only.
+ return syserror.EPERM
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ n := int64(0)
+ switch whence {
+ case linux.SEEK_SET:
+ // use offset as specified
+ case linux.SEEK_CUR:
+ n = fd.off
+ case linux.SEEK_END:
+ n = int64(fd.d.size)
+ default:
+ return 0, syserror.EINVAL
+ }
+ if offset > math.MaxInt64-n {
+ return 0, syserror.EINVAL
+ }
+ offset += n
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+ fd.off = offset
+ return offset, nil
+}
+
+// generateMerkle generates a Merkle tree file for fd. If fd points to a file
+// /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The hash
+// of the generated Merkle tree and the data size are returned. If fd points
+// to a regular file, the data is the content of the file. If fd points to a
+// directory, the data is all hashes of its children, which have already been
+// written to the Merkle tree file.
+func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64, error) {
+ fdReader := vfs.FileReadWriteSeeker{
+ FD: fd.lowerFD,
+ Ctx: ctx,
+ }
+ merkleReader := vfs.FileReadWriteSeeker{
+ FD: fd.merkleReader,
+ Ctx: ctx,
+ }
+ merkleWriter := vfs.FileReadWriteSeeker{
+ FD: fd.merkleWriter,
+ Ctx: ctx,
+ }
+ params := &merkletree.GenerateParams{
+ TreeReader: &merkleReader,
+ TreeWriter: &merkleWriter,
+ //TODO(b/156980949): Support passing other hash algorithms.
+ HashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256,
+ }
+
+ switch atomic.LoadUint32(&fd.d.mode) & linux.S_IFMT {
+ case linux.S_IFREG:
+ // For a regular file, generate a Merkle tree based on its
+ // content.
+ stat, err := fd.lowerFD.Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ return nil, 0, err
+ }
+
+ params.File = &fdReader
+ params.Size = int64(stat.Size)
+ params.Name = fd.d.name
+ params.Mode = uint32(stat.Mode)
+ params.UID = stat.UID
+ params.GID = stat.GID
+ params.DataAndTreeInSameFile = false
+ case linux.S_IFDIR:
+ // For a directory, generate a Merkle tree based on the hashes
+ // of its children that have already been written to the Merkle
+ // tree file.
+ merkleStat, err := fd.merkleReader.Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ return nil, 0, err
+ }
+
+ params.Size = int64(merkleStat.Size)
+
+ stat, err := fd.lowerFD.Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ return nil, 0, err
+ }
+
+ params.File = &merkleReader
+ params.Name = fd.d.name
+ params.Mode = uint32(stat.Mode)
+ params.UID = stat.UID
+ params.GID = stat.GID
+ params.DataAndTreeInSameFile = true
+ default:
+ // TODO(b/167728857): Investigate whether and how we should
+ // enable other types of files.
+ return nil, 0, syserror.EINVAL
+ }
+ hash, err := merkletree.Generate(params)
+ return hash, uint64(params.Size), err
+}
+
+// enableVerity enables verity features on fd by generating a Merkle tree file
+// and storing its hash in its parent directory's Merkle tree.
+func (fd *fileDescription) enableVerity(ctx context.Context, uio usermem.IO) (uintptr, error) {
+ if !fd.d.fs.allowRuntimeEnable {
+ return 0, syserror.EPERM
+ }
+
+ fd.d.fs.verityMu.Lock()
+ defer fd.d.fs.verityMu.Unlock()
+
+ // In allowRuntimeEnable mode, the underlying fd and read/write fd for
+ // the Merkle tree file should have all been initialized. For any file
+ // or directory other than the root, the parent Merkle tree file should
+ // have also been initialized.
+ if fd.lowerFD == nil || fd.merkleReader == nil || fd.merkleWriter == nil || (fd.parentMerkleWriter == nil && fd.d != fd.d.fs.rootDentry) {
+ return 0, alertIntegrityViolation("Unexpected verity fd: missing expected underlying fds")
+ }
+
+ hash, dataSize, err := fd.generateMerkle(ctx)
+ if err != nil {
+ return 0, err
+ }
+
+ if fd.parentMerkleWriter != nil {
+ stat, err := fd.parentMerkleWriter.Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ return 0, err
+ }
+
+ // Write the hash of fd to the parent directory's Merkle tree
+ // file, as it should be part of the parent Merkle tree data.
+ // parentMerkleWriter is open with O_APPEND, so it should write
+ // directly to the end of the file.
+ if _, err = fd.parentMerkleWriter.Write(ctx, usermem.BytesIOSequence(hash), vfs.WriteOptions{}); err != nil {
+ return 0, err
+ }
+
+ // Record the offset of the hash of fd in the parent directory's
+ // Merkle tree file.
+ if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{
+ Name: merkleOffsetInParentXattr,
+ Value: strconv.Itoa(int(stat.Size)),
+ }); err != nil {
+ return 0, err
+ }
+ }
+
+ // Record the size of the data being hashed for fd.
+ if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{
+ Name: merkleSizeXattr,
+ Value: strconv.Itoa(int(dataSize)),
+ }); err != nil {
+ return 0, err
+ }
+ fd.d.hash = append(fd.d.hash, hash...)
+ return 0, nil
+}
+
+// measureVerity writes the hash of fd into the buffer at verityDigest.
+func (fd *fileDescription) measureVerity(ctx context.Context, uio usermem.IO, verityDigest usermem.Addr) (uintptr, error) {
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ return 0, syserror.EINVAL
+ }
+ var metadata linux.DigestMetadata
+
+ // If allowRuntimeEnable is true, an empty fd.d.hash indicates that
+ // verity is not enabled for the file. If allowRuntimeEnable is false,
+ // this is an integrity violation because all files should have verity
+ // enabled, in which case fd.d.hash should be set.
+ if len(fd.d.hash) == 0 {
+ if fd.d.fs.allowRuntimeEnable {
+ return 0, syserror.ENODATA
+ }
+ return 0, alertIntegrityViolation("Ioctl measureVerity: no hash found")
+ }
+
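+ // The buffer at verityDigest is laid out as the DigestMetadata header
+ // immediately followed by the raw hash bytes:
+ //
+ // verityDigest: [ DigestMetadata | fd.d.hash ]
+ //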
+ // The first part of VerityDigest is the metadata.
+ if _, err := metadata.CopyIn(t, verityDigest); err != nil {
+ return 0, err
+ }
+ if metadata.DigestSize < uint16(len(fd.d.hash)) {
+ return 0, syserror.EOVERFLOW
+ }
+
+ // Populate the output digest size, since DigestSize is both input and
+ // output.
+ metadata.DigestSize = uint16(len(fd.d.hash))
+
+ // First copy the metadata.
+ if _, err := metadata.CopyOut(t, verityDigest); err != nil {
+ return 0, err
+ }
+
+ // Now copy the hash bytes to the memory after the metadata.
+ _, err := t.CopyOutBytes(usermem.Addr(uintptr(verityDigest)+linux.SizeOfDigestMetadata), fd.d.hash)
+ return 0, err
+}
+
+func (fd *fileDescription) verityFlags(ctx context.Context, uio usermem.IO, flags usermem.Addr) (uintptr, error) {
+ f := int32(0)
+
+ // All enabled files should store a hash. This flag is not settable via
+ // FS_IOC_SETFLAGS.
+ if len(fd.d.hash) != 0 {
+ f |= linux.FS_VERITY_FL
+ }
+
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ return 0, syserror.EINVAL
+ }
+ _, err := primitive.CopyInt32Out(t, flags, f)
+ return 0, err
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ switch cmd := args[1].Uint(); cmd {
+ case linux.FS_IOC_ENABLE_VERITY:
+ return fd.enableVerity(ctx, uio)
+ case linux.FS_IOC_MEASURE_VERITY:
+ return fd.measureVerity(ctx, uio, args[2].Pointer())
+ case linux.FS_IOC_GETFLAGS:
+ return fd.verityFlags(ctx, uio, args[2].Pointer())
+ default:
+ // TODO(b/169682228): Investigate which ioctl commands should
+ // be allowed.
+ return 0, syserror.ENOSYS
+ }
+}
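+
+// From inside the sandbox these commands follow the Linux fs-verity ioctl
+// interface. A hedged guest-side sketch using golang.org/x/sys/unix (note
+// that this implementation ignores the FS_IOC_ENABLE_VERITY argument, so 0
+// is passed; the path is hypothetical):
+//
+// fd, _ := unix.Open("/mnt/verity/foo", unix.O_RDONLY, 0)
+// _, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(fd), unix.FS_IOC_ENABLE_VERITY, 0)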
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ // Implement Read by calling PRead at the current offset and advancing it.
+ fd.mu.Lock()
+ n, err := fd.PRead(ctx, dst, fd.off, opts)
+ fd.off += n
+ fd.mu.Unlock()
+ return n, err
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ // No need to verify if the file is not enabled yet in
+ // allowRuntimeEnable mode.
+ if !fd.d.verityEnabled() {
+ return fd.lowerFD.PRead(ctx, dst, offset, opts)
+ }
+
+ fd.d.fs.verityMu.RLock()
+ defer fd.d.fs.verityMu.RUnlock()
+	// dataSize is the size of the hashed data, stored in an xattr as a string.
+ dataSize, err := fd.merkleReader.GetXattr(ctx, &vfs.GetXattrOptions{
+ Name: merkleSizeXattr,
+ Size: sizeOfStringInt32,
+ })
+
+ // The Merkle tree file for the child should have been created and
+ // contains the expected xattrs. If the xattr does not exist, it
+ // indicates unexpected modifications to the file system.
+ if err == syserror.ENODATA {
+ return 0, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s: %v", merkleSizeXattr, err))
+ }
+ if err != nil {
+ return 0, err
+ }
+
+ // The dataSize xattr should be an integer. If it's not, it indicates
+ // unexpected modifications to the file system.
+ size, err := strconv.Atoi(dataSize)
+ if err != nil {
+ return 0, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", merkleSizeXattr, err))
+ }
+
+ dataReader := vfs.FileReadWriteSeeker{
+ FD: fd.lowerFD,
+ Ctx: ctx,
+ }
+
+ merkleReader := vfs.FileReadWriteSeeker{
+ FD: fd.merkleReader,
+ Ctx: ctx,
+ }
+
+ n, err := merkletree.Verify(&merkletree.VerifyParams{
+ Out: dst.Writer(ctx),
+ File: &dataReader,
+ Tree: &merkleReader,
+ Size: int64(size),
+ Name: fd.d.name,
+ Mode: fd.d.mode,
+ UID: fd.d.uid,
+ GID: fd.d.gid,
+		// TODO(b/156980949): Support passing other hash algorithms.
+ HashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256,
+ ReadOffset: offset,
+ ReadSize: dst.NumBytes(),
+ Expected: fd.d.hash,
+ DataAndTreeInSameFile: false,
+ })
+ if err != nil {
+ return 0, alertIntegrityViolation(fmt.Sprintf("Verification failed: %v", err))
+ }
+ return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ return 0, syserror.EROFS
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ return 0, syserror.EROFS
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return fd.lowerFD.LockPOSIX(ctx, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return fd.lowerFD.UnlockPOSIX(ctx, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/verity/verity_test.go b/pkg/sentry/fsimpl/verity/verity_test.go
new file mode 100644
index 000000000..c647cbfd3
--- /dev/null
+++ b/pkg/sentry/fsimpl/verity/verity_test.go
@@ -0,0 +1,700 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package verity
+
+import (
+ "fmt"
+ "io"
+ "math/rand"
+ "testing"
+ "time"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// rootMerkleFilename is the name of the root Merkle tree file.
+const rootMerkleFilename = "root.verity"
+
+// maxDataSize is the maximum size of the random data written to a test file.
+const maxDataSize = 100000
+
+// newVerityRoot creates a new verity mount, and returns the root. The
+// underlying file system is tmpfs. On success, cleanup of the root and
+// mount namespace is registered via t.Cleanup and runs when the test ends.
+func newVerityRoot(t *testing.T) (*vfs.VirtualFilesystem, vfs.VirtualDentry, *kernel.Task, error) {
+ k, err := testutil.Boot()
+ if err != nil {
+ t.Fatalf("testutil.Boot: %v", err)
+ }
+
+ ctx := k.SupervisorContext()
+
+ rand.Seed(time.Now().UnixNano())
+ vfsObj := &vfs.VirtualFilesystem{}
+ if err := vfsObj.Init(ctx); err != nil {
+ return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err)
+ }
+
+ vfsObj.MustRegisterFilesystemType("verity", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ })
+
+ vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ })
+
+ mntns, err := vfsObj.NewMountNamespace(ctx, auth.CredentialsFromContext(ctx), "", "verity", &vfs.MountOptions{
+ GetFilesystemOptions: vfs.GetFilesystemOptions{
+ InternalData: InternalFilesystemOptions{
+ RootMerkleFileName: rootMerkleFilename,
+ LowerName: "tmpfs",
+ AllowRuntimeEnable: true,
+ NoCrashOnVerificationFailure: true,
+ },
+ },
+ })
+ if err != nil {
+ return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("NewMountNamespace: %v", err)
+ }
+ root := mntns.Root()
+ root.IncRef()
+
+ // Use lowerRoot in the task as we modify the lower file system
+ // directly in many tests.
+ lowerRoot := root.Dentry().Impl().(*dentry).lowerVD
+ tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+ task, err := testutil.CreateTask(ctx, "name", tc, mntns, lowerRoot, lowerRoot)
+ if err != nil {
+ t.Fatalf("testutil.CreateTask: %v", err)
+ }
+
+ t.Helper()
+ t.Cleanup(func() {
+ root.DecRef(ctx)
+ mntns.DecRef(ctx)
+ })
+ return vfsObj, root, task, nil
+}
+
+// newFileFD creates a new file in the verity mount, writes random data to
+// it, and returns an FD for the verity file along with the data size.
+func newFileFD(ctx context.Context, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, filePath string, mode linux.FileMode) (*vfs.FileDescription, int, error) {
+ creds := auth.CredentialsFromContext(ctx)
+ lowerRoot := root.Dentry().Impl().(*dentry).lowerVD
+
+ // Create the file in the underlying file system.
+ lowerFD, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+ Root: lowerRoot,
+ Start: lowerRoot,
+ Path: fspath.Parse(filePath),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+ Mode: linux.ModeRegular | mode,
+ })
+ if err != nil {
+ return nil, 0, err
+ }
+
+ // Generate random data to be written to the file.
+ dataSize := rand.Intn(maxDataSize) + 1
+ data := make([]byte, dataSize)
+ rand.Read(data)
+
+ // Write directly to the underlying FD, since verity FD is read-only.
+ n, err := lowerFD.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+ if err != nil {
+ return nil, 0, err
+ }
+
+ if n != int64(len(data)) {
+ return nil, 0, fmt.Errorf("lowerFD.Write got write length %d, want %d", n, len(data))
+ }
+
+ lowerFD.DecRef(ctx)
+
+ // Now open the verity file descriptor.
+ fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(filePath),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ Mode: linux.ModeRegular | mode,
+ })
+ return fd, dataSize, err
+}
+
+// corruptRandomBit randomly flips a bit in the file represented by fd.
+func corruptRandomBit(ctx context.Context, fd *vfs.FileDescription, size int) error {
+ // Flip a random bit in the underlying file.
+ randomPos := int64(rand.Intn(size))
+ byteToModify := make([]byte, 1)
+ if _, err := fd.PRead(ctx, usermem.BytesIOSequence(byteToModify), randomPos, vfs.ReadOptions{}); err != nil {
+ return fmt.Errorf("lowerFD.PRead: %v", err)
+ }
+ byteToModify[0] ^= 1
+ if _, err := fd.PWrite(ctx, usermem.BytesIOSequence(byteToModify), randomPos, vfs.WriteOptions{}); err != nil {
+ return fmt.Errorf("lowerFD.PWrite: %v", err)
+ }
+ return nil
+}
+
+// TestOpen ensures that when a file is created, the corresponding Merkle tree
+// file and the root Merkle tree file exist.
+func TestOpen(t *testing.T) {
+ vfsObj, root, ctx, err := newVerityRoot(t)
+ if err != nil {
+ t.Fatalf("newVerityRoot: %v", err)
+ }
+
+ filename := "verity-test-file"
+ if _, _, err := newFileFD(ctx, vfsObj, root, filename, 0644); err != nil {
+ t.Fatalf("newFileFD: %v", err)
+ }
+
+ // Ensure that the corresponding Merkle tree file is created.
+ lowerRoot := root.Dentry().Impl().(*dentry).lowerVD
+ if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: lowerRoot,
+ Start: lowerRoot,
+ Path: fspath.Parse(merklePrefix + filename),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ }); err != nil {
+ t.Errorf("OpenAt Merkle tree file %s: %v", merklePrefix+filename, err)
+ }
+
+	// Ensure the root Merkle tree file is created.
+ if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: lowerRoot,
+ Start: lowerRoot,
+ Path: fspath.Parse(merklePrefix + rootMerkleFilename),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ }); err != nil {
+ t.Errorf("OpenAt root Merkle tree file %s: %v", merklePrefix+rootMerkleFilename, err)
+ }
+}
+
+// TestPReadUnmodifiedFileSucceeds ensures that pread from an untouched verity
+// file succeeds after enabling verity for it.
+func TestPReadUnmodifiedFileSucceeds(t *testing.T) {
+ vfsObj, root, ctx, err := newVerityRoot(t)
+ if err != nil {
+ t.Fatalf("newVerityRoot: %v", err)
+ }
+
+ filename := "verity-test-file"
+ fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+ if err != nil {
+ t.Fatalf("newFileFD: %v", err)
+ }
+
+ // Enable verity on the file and confirm a normal read succeeds.
+ var args arch.SyscallArguments
+ args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+ if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("Ioctl: %v", err)
+ }
+
+ buf := make([]byte, size)
+ n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{})
+ if err != nil && err != io.EOF {
+ t.Fatalf("fd.PRead: %v", err)
+ }
+
+ if n != int64(size) {
+ t.Errorf("fd.PRead got read length %d, want %d", n, size)
+ }
+}
+
+// TestReadUnmodifiedFileSucceeds ensures that read from an untouched verity
+// file succeeds after enabling verity for it.
+func TestReadUnmodifiedFileSucceeds(t *testing.T) {
+ vfsObj, root, ctx, err := newVerityRoot(t)
+ if err != nil {
+ t.Fatalf("newVerityRoot: %v", err)
+ }
+
+ filename := "verity-test-file"
+ fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+ if err != nil {
+ t.Fatalf("newFileFD: %v", err)
+ }
+
+ // Enable verity on the file and confirm a normal read succeeds.
+ var args arch.SyscallArguments
+ args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+ if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("Ioctl: %v", err)
+ }
+
+ buf := make([]byte, size)
+ n, err := fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
+ if err != nil && err != io.EOF {
+ t.Fatalf("fd.Read: %v", err)
+ }
+
+ if n != int64(size) {
+		t.Errorf("fd.Read got read length %d, want %d", n, size)
+ }
+}
+
+// TestReopenUnmodifiedFileSucceeds ensures that reopening an untouched verity
+// file succeeds after enabling verity for it.
+func TestReopenUnmodifiedFileSucceeds(t *testing.T) {
+ vfsObj, root, ctx, err := newVerityRoot(t)
+ if err != nil {
+ t.Fatalf("newVerityRoot: %v", err)
+ }
+
+ filename := "verity-test-file"
+ fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+ if err != nil {
+ t.Fatalf("newFileFD: %v", err)
+ }
+
+	// Enable verity on the file and confirm that reopening it succeeds.
+ var args arch.SyscallArguments
+ args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+ if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("Ioctl: %v", err)
+ }
+
+ // Ensure reopening the verity enabled file succeeds.
+ if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(filename),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ Mode: linux.ModeRegular,
+ }); err != nil {
+ t.Errorf("reopen enabled file failed: %v", err)
+ }
+}
+
+// TestPReadModifiedFileFails ensures that pread from a modified verity file
+// fails.
+func TestPReadModifiedFileFails(t *testing.T) {
+ vfsObj, root, ctx, err := newVerityRoot(t)
+ if err != nil {
+ t.Fatalf("newVerityRoot: %v", err)
+ }
+
+ filename := "verity-test-file"
+ fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+ if err != nil {
+ t.Fatalf("newFileFD: %v", err)
+ }
+
+ // Enable verity on the file.
+ var args arch.SyscallArguments
+ args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+ if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("Ioctl: %v", err)
+ }
+
+ // Open a new lowerFD that's read/writable.
+ lowerVD := fd.Impl().(*fileDescription).d.lowerVD
+
+ lowerFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: lowerVD,
+ Start: lowerVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR,
+ })
+ if err != nil {
+ t.Fatalf("OpenAt: %v", err)
+ }
+
+ if err := corruptRandomBit(ctx, lowerFD, size); err != nil {
+ t.Fatalf("corruptRandomBit: %v", err)
+ }
+
+ // Confirm that read from the modified file fails.
+ buf := make([]byte, size)
+ if _, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}); err == nil {
+ t.Fatalf("fd.PRead succeeded, expected failure")
+ }
+}
+
+// TestReadModifiedFileFails ensures that read from a modified verity file
+// fails.
+func TestReadModifiedFileFails(t *testing.T) {
+ vfsObj, root, ctx, err := newVerityRoot(t)
+ if err != nil {
+ t.Fatalf("newVerityRoot: %v", err)
+ }
+
+ filename := "verity-test-file"
+ fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+ if err != nil {
+ t.Fatalf("newFileFD: %v", err)
+ }
+
+ // Enable verity on the file.
+ var args arch.SyscallArguments
+ args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+ if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("Ioctl: %v", err)
+ }
+
+ // Open a new lowerFD that's read/writable.
+ lowerVD := fd.Impl().(*fileDescription).d.lowerVD
+
+ lowerFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: lowerVD,
+ Start: lowerVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR,
+ })
+ if err != nil {
+ t.Fatalf("OpenAt: %v", err)
+ }
+
+ if err := corruptRandomBit(ctx, lowerFD, size); err != nil {
+ t.Fatalf("corruptRandomBit: %v", err)
+ }
+
+ // Confirm that read from the modified file fails.
+ buf := make([]byte, size)
+ if _, err := fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{}); err == nil {
+ t.Fatalf("fd.Read succeeded, expected failure")
+ }
+}
+
+// TestModifiedMerkleFails ensures that read from a verity file fails if the
+// corresponding Merkle tree file is modified.
+func TestModifiedMerkleFails(t *testing.T) {
+ vfsObj, root, ctx, err := newVerityRoot(t)
+ if err != nil {
+ t.Fatalf("newVerityRoot: %v", err)
+ }
+
+ filename := "verity-test-file"
+ fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+ if err != nil {
+ t.Fatalf("newFileFD: %v", err)
+ }
+
+ // Enable verity on the file.
+ var args arch.SyscallArguments
+ args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+ if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("Ioctl: %v", err)
+ }
+
+ // Open a new lowerMerkleFD that's read/writable.
+ lowerMerkleVD := fd.Impl().(*fileDescription).d.lowerMerkleVD
+
+ lowerMerkleFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: lowerMerkleVD,
+ Start: lowerMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR,
+ })
+ if err != nil {
+ t.Fatalf("OpenAt: %v", err)
+ }
+
+ // Flip a random bit in the Merkle tree file.
+ stat, err := lowerMerkleFD.Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ t.Fatalf("stat: %v", err)
+ }
+ merkleSize := int(stat.Size)
+ if err := corruptRandomBit(ctx, lowerMerkleFD, merkleSize); err != nil {
+ t.Fatalf("corruptRandomBit: %v", err)
+ }
+
+ // Confirm that read from a file with modified Merkle tree fails.
+ buf := make([]byte, size)
+ if _, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}); err == nil {
+ t.Fatalf("fd.PRead succeeded with modified Merkle file")
+ }
+}
+
+// TestModifiedParentMerkleFails ensures that opening a verity enabled file in
+// a verity enabled directory fails if the hashes related to the target file
+// in the parent Merkle tree file are modified.
+func TestModifiedParentMerkleFails(t *testing.T) {
+ vfsObj, root, ctx, err := newVerityRoot(t)
+ if err != nil {
+ t.Fatalf("newVerityRoot: %v", err)
+ }
+
+ filename := "verity-test-file"
+ fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+ if err != nil {
+ t.Fatalf("newFileFD: %v", err)
+ }
+
+ // Enable verity on the file.
+ var args arch.SyscallArguments
+ args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+ if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("Ioctl: %v", err)
+ }
+
+ // Enable verity on the parent directory.
+ parentFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ })
+ if err != nil {
+ t.Fatalf("OpenAt: %v", err)
+ }
+
+ if _, err := parentFD.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("Ioctl: %v", err)
+ }
+
+ // Open a new lowerMerkleFD that's read/writable.
+ parentLowerMerkleVD := fd.Impl().(*fileDescription).d.parent.lowerMerkleVD
+
+ parentLowerMerkleFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: parentLowerMerkleVD,
+ Start: parentLowerMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR,
+ })
+ if err != nil {
+ t.Fatalf("OpenAt: %v", err)
+ }
+
+ // Flip a random bit in the parent Merkle tree file.
+ // This parent directory contains only one child, so any random
+ // modification in the parent Merkle tree should cause verification
+ // failure when opening the child file.
+ stat, err := parentLowerMerkleFD.Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ t.Fatalf("stat: %v", err)
+ }
+ parentMerkleSize := int(stat.Size)
+ if err := corruptRandomBit(ctx, parentLowerMerkleFD, parentMerkleSize); err != nil {
+ t.Fatalf("corruptRandomBit: %v", err)
+ }
+
+ parentLowerMerkleFD.DecRef(ctx)
+
+ // Ensure reopening the verity enabled file fails.
+ if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(filename),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ Mode: linux.ModeRegular,
+ }); err == nil {
+ t.Errorf("OpenAt file with modified parent Merkle succeeded")
+ }
+}
+
+// TestUnmodifiedStatSucceeds ensures that stat of an untouched verity file
+// succeeds after enabling verity for it.
+func TestUnmodifiedStatSucceeds(t *testing.T) {
+ vfsObj, root, ctx, err := newVerityRoot(t)
+ if err != nil {
+ t.Fatalf("newVerityRoot: %v", err)
+ }
+
+ filename := "verity-test-file"
+ fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+ if err != nil {
+ t.Fatalf("newFileFD: %v", err)
+ }
+
+	// Enable verity on the file and confirm stat succeeds.
+ var args arch.SyscallArguments
+ args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+ if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("fd.Ioctl: %v", err)
+ }
+
+ if _, err := fd.Stat(ctx, vfs.StatOptions{}); err != nil {
+ t.Errorf("fd.Stat: %v", err)
+ }
+}
+
+// TestModifiedStatFails checks that stat of a verity file fails after the
+// stat of the underlying file is modified.
+func TestModifiedStatFails(t *testing.T) {
+ vfsObj, root, ctx, err := newVerityRoot(t)
+ if err != nil {
+ t.Fatalf("newVerityRoot: %v", err)
+ }
+
+ filename := "verity-test-file"
+ fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+ if err != nil {
+ t.Fatalf("newFileFD: %v", err)
+ }
+
+ // Enable verity on the file.
+ var args arch.SyscallArguments
+ args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+ if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("fd.Ioctl: %v", err)
+ }
+
+ lowerFD := fd.Impl().(*fileDescription).lowerFD
+ // Change the stat of the underlying file, and check that stat fails.
+ if err := lowerFD.SetStat(ctx, vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: uint32(linux.STATX_MODE),
+ Mode: 0777,
+ },
+ }); err != nil {
+ t.Fatalf("lowerFD.SetStat: %v", err)
+ }
+
+ if _, err := fd.Stat(ctx, vfs.StatOptions{}); err == nil {
+ t.Errorf("fd.Stat succeeded when it should fail")
+ }
+}
+
+// TestOpenDeletedFileFails ensures that opening a deleted or renamed verity
+// enabled file, or the corresponding Merkle tree file, fails with a
+// verification error.
+func TestOpenDeletedFileFails(t *testing.T) {
+ testCases := []struct {
+		// Tests removing files if remove is true. Otherwise tests
+ // renaming files.
+ remove bool
+ // The original file is removed/renamed if changeFile is true.
+ changeFile bool
+ // The Merkle tree file is removed/renamed if changeMerkleFile
+ // is true.
+ changeMerkleFile bool
+ }{
+ {
+ remove: true,
+ changeFile: true,
+ changeMerkleFile: false,
+ },
+ {
+ remove: true,
+ changeFile: false,
+ changeMerkleFile: true,
+ },
+ {
+ remove: false,
+ changeFile: true,
+ changeMerkleFile: false,
+ },
+ {
+ remove: false,
+			changeFile:       false,
+			changeMerkleFile: true,
+ },
+ }
+ for _, tc := range testCases {
+		t.Run(fmt.Sprintf("remove:%t changeFile:%t changeMerkleFile:%t", tc.remove, tc.changeFile, tc.changeMerkleFile), func(t *testing.T) {
+ vfsObj, root, ctx, err := newVerityRoot(t)
+ if err != nil {
+ t.Fatalf("newVerityRoot: %v", err)
+ }
+
+ filename := "verity-test-file"
+ fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+ if err != nil {
+ t.Fatalf("newFileFD: %v", err)
+ }
+
+ // Enable verity on the file.
+ var args arch.SyscallArguments
+ args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+ if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("Ioctl: %v", err)
+ }
+
+ rootLowerVD := root.Dentry().Impl().(*dentry).lowerVD
+ if tc.remove {
+ if tc.changeFile {
+ if err := vfsObj.UnlinkAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: rootLowerVD,
+ Start: rootLowerVD,
+ Path: fspath.Parse(filename),
+ }); err != nil {
+ t.Fatalf("UnlinkAt: %v", err)
+ }
+ }
+ if tc.changeMerkleFile {
+ if err := vfsObj.UnlinkAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: rootLowerVD,
+ Start: rootLowerVD,
+ Path: fspath.Parse(merklePrefix + filename),
+ }); err != nil {
+ t.Fatalf("UnlinkAt: %v", err)
+ }
+ }
+ } else {
+ newFilename := "renamed-test-file"
+ if tc.changeFile {
+ if err := vfsObj.RenameAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: rootLowerVD,
+ Start: rootLowerVD,
+ Path: fspath.Parse(filename),
+ }, &vfs.PathOperation{
+ Root: rootLowerVD,
+ Start: rootLowerVD,
+ Path: fspath.Parse(newFilename),
+ }, &vfs.RenameOptions{}); err != nil {
+ t.Fatalf("RenameAt: %v", err)
+ }
+ }
+ if tc.changeMerkleFile {
+ if err := vfsObj.RenameAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: rootLowerVD,
+ Start: rootLowerVD,
+ Path: fspath.Parse(merklePrefix + filename),
+ }, &vfs.PathOperation{
+ Root: rootLowerVD,
+ Start: rootLowerVD,
+ Path: fspath.Parse(merklePrefix + newFilename),
+ }, &vfs.RenameOptions{}); err != nil {
+					t.Fatalf("RenameAt: %v", err)
+ }
+ }
+ }
+
+ // Ensure reopening the verity enabled file fails.
+ if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(filename),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ Mode: linux.ModeRegular,
+ }); err != syserror.EIO {
+ t.Errorf("got OpenAt error: %v, expected EIO", err)
+ }
+ })
+ }
+}
diff --git a/pkg/sentry/hostmm/BUILD b/pkg/sentry/hostmm/BUILD
index 61c78569d..300b7ccce 100644
--- a/pkg/sentry/hostmm/BUILD
+++ b/pkg/sentry/hostmm/BUILD
@@ -7,11 +7,14 @@ go_library(
srcs = [
"cgroup.go",
"hostmm.go",
+ "membarrier.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
+ "//pkg/abi/linux",
"//pkg/fd",
"//pkg/log",
"//pkg/usermem",
+ "@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/pkg/sentry/hostmm/membarrier.go b/pkg/sentry/hostmm/membarrier.go
new file mode 100644
index 000000000..4468d75f1
--- /dev/null
+++ b/pkg/sentry/hostmm/membarrier.go
@@ -0,0 +1,90 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hostmm
+
+import (
+ "syscall"
+
+ "golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/log"
+)
+
+var (
+ haveMembarrierGlobal = false
+ haveMembarrierPrivateExpedited = false
+)
+
+func init() {
+ supported, _, e := syscall.RawSyscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_QUERY, 0 /* flags */, 0 /* unused */)
+ if e != 0 {
+ if e != syscall.ENOSYS {
+ log.Warningf("membarrier(MEMBARRIER_CMD_QUERY) failed: %s", e.Error())
+ }
+ return
+ }
+ // We don't use MEMBARRIER_CMD_GLOBAL_EXPEDITED because this sends IPIs to
+ // all CPUs running tasks that have previously invoked
+// MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, which presents a DoS risk.
+ // (MEMBARRIER_CMD_GLOBAL is synchronize_rcu(), i.e. it waits for an RCU
+ // grace period to elapse without bothering other CPUs.
+ // MEMBARRIER_CMD_PRIVATE_EXPEDITED sends IPIs only to CPUs running tasks
+ // sharing the caller's MM.)
+ if supported&linux.MEMBARRIER_CMD_GLOBAL != 0 {
+ haveMembarrierGlobal = true
+ }
+ if req := uintptr(linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED | linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED); supported&req == req {
+ if _, _, e := syscall.RawSyscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0 /* flags */, 0 /* unused */); e != 0 {
+ log.Warningf("membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED) failed: %s", e.Error())
+ } else {
+ haveMembarrierPrivateExpedited = true
+ }
+ }
+}
+
+// HaveGlobalMemoryBarrier returns true if GlobalMemoryBarrier is supported.
+func HaveGlobalMemoryBarrier() bool {
+ return haveMembarrierGlobal
+}
+
+// GlobalMemoryBarrier blocks until "all running threads [in the host OS] have
+// passed through a state where all memory accesses to user-space addresses
+// match program order between entry to and return from [GlobalMemoryBarrier]",
+// as for membarrier(2).
+//
+// Preconditions: HaveGlobalMemoryBarrier() == true.
+func GlobalMemoryBarrier() error {
+ if _, _, e := syscall.Syscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_GLOBAL, 0 /* flags */, 0 /* unused */); e != 0 {
+ return e
+ }
+ return nil
+}
+
+// HaveProcessMemoryBarrier returns true if ProcessMemoryBarrier is supported.
+func HaveProcessMemoryBarrier() bool {
+ return haveMembarrierPrivateExpedited
+}
+
+// ProcessMemoryBarrier is equivalent to GlobalMemoryBarrier, but only
+// synchronizes with threads sharing a virtual address space (from the host OS'
+// perspective) with the calling thread.
+//
+// Preconditions: HaveProcessMemoryBarrier() == true.
+func ProcessMemoryBarrier() error {
+ if _, _, e := syscall.RawSyscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0 /* flags */, 0 /* unused */); e != 0 {
+ return e
+ }
+ return nil
+}
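
Callers are expected to gate on the Have* probes before issuing a barrier, since init() may have found membarrier(2) unsupported or the private-expedited registration may have failed. A minimal sketch of that pattern; the fallback error is hypothetical:

	package main

	import (
		"errors"
		"fmt"

		"gvisor.dev/gvisor/pkg/sentry/hostmm"
	)

	var errNoMembarrier = errors.New("membarrier(2) unavailable")

	func emitBarrier() error {
		// Prefer the private-expedited command: it IPIs only CPUs
		// running threads that share the caller's address space.
		if hostmm.HaveProcessMemoryBarrier() {
			return hostmm.ProcessMemoryBarrier()
		}
		// MEMBARRIER_CMD_GLOBAL instead waits out an RCU grace period.
		if hostmm.HaveGlobalMemoryBarrier() {
			return hostmm.GlobalMemoryBarrier()
		}
		return errNoMembarrier // caller must fall back to another mechanism
	}

	func main() {
		if err := emitBarrier(); err != nil {
			fmt.Println(err)
		}
	}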
diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD
index 07bf39fed..5bba9de0b 100644
--- a/pkg/sentry/inet/BUILD
+++ b/pkg/sentry/inet/BUILD
@@ -15,6 +15,7 @@ go_library(
],
deps = [
"//pkg/context",
+ "//pkg/tcpip",
"//pkg/tcpip/stack",
],
)
diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go
index 2916a0644..f31277d30 100644
--- a/pkg/sentry/inet/inet.go
+++ b/pkg/sentry/inet/inet.go
@@ -15,7 +15,10 @@
// Package inet defines semantics for IP stacks.
package inet
-import "gvisor.dev/gvisor/pkg/tcpip/stack"
+import (
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
// Stack represents a TCP/IP stack.
type Stack interface {
@@ -29,9 +32,13 @@ type Stack interface {
InterfaceAddrs() map[int32][]InterfaceAddr
// AddInterfaceAddr adds an address to the network interface identified by
- // index.
+ // idx.
AddInterfaceAddr(idx int32, addr InterfaceAddr) error
+ // RemoveInterfaceAddr removes an address from the network interface
+ // identified by idx.
+ RemoveInterfaceAddr(idx int32, addr InterfaceAddr) error
+
// SupportsIPv6 returns true if the stack supports IPv6 connectivity.
SupportsIPv6() bool
@@ -56,6 +63,12 @@ type Stack interface {
// settings.
SetTCPSACKEnabled(enabled bool) error
+ // TCPRecovery returns the TCP loss detection algorithm.
+ TCPRecovery() (TCPLossRecovery, error)
+
+ // SetTCPRecovery attempts to change TCP loss detection algorithm.
+ SetTCPRecovery(recovery TCPLossRecovery) error
+
// Statistics reports stack statistics.
Statistics(stat interface{}, arg string) error
@@ -74,6 +87,12 @@ type Stack interface {
// RestoreCleanupEndpoints adds endpoints to cleanup tracking. This is useful
// for restoring a stack after a save.
RestoreCleanupEndpoints([]stack.TransportEndpoint)
+
+	// Forwarding returns whether packet forwarding between NICs is enabled.
+ Forwarding(protocol tcpip.NetworkProtocolNumber) bool
+
+ // SetForwarding enables or disables packet forwarding between NICs.
+ SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error
}
// Interface contains information about a network interface.
@@ -189,3 +208,14 @@ type StatSNMPUDP [8]uint64
// StatSNMPUDPLite describes UdpLite line of /proc/net/snmp.
type StatSNMPUDPLite [8]uint64
+
+// TCPLossRecovery indicates TCP loss detection and recovery methods to use.
+type TCPLossRecovery int32
+
+// Loss recovery constants from include/net/tcp.h which are used to set
+// /proc/sys/net/ipv4/tcp_recovery.
+const (
+ TCP_RACK_LOSS_DETECTION TCPLossRecovery = 1 << iota
+ TCP_RACK_STATIC_REO_WND
+ TCP_RACK_NO_DUPTHRESH
+)
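
Because these are bit flags, a recovery setting composes them exactly as /proc/sys/net/ipv4/tcp_recovery does; a short sketch:

	package main

	import (
		"fmt"

		"gvisor.dev/gvisor/pkg/sentry/inet"
	)

	func main() {
		// Enable RACK loss detection with a static reordering window,
		// equivalent to writing 3 to /proc/sys/net/ipv4/tcp_recovery.
		rec := inet.TCP_RACK_LOSS_DETECTION | inet.TCP_RACK_STATIC_REO_WND
		fmt.Printf("tcp_recovery = %d\n", int32(rec)) // prints 3
	}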
diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go
index d8961fc94..9ebeba8a3 100644
--- a/pkg/sentry/inet/test_stack.go
+++ b/pkg/sentry/inet/test_stack.go
@@ -14,7 +14,13 @@
package inet
-import "gvisor.dev/gvisor/pkg/tcpip/stack"
+import (
+ "bytes"
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
// TestStack is a dummy implementation of Stack for tests.
type TestStack struct {
@@ -25,6 +31,8 @@ type TestStack struct {
TCPRecvBufSize TCPBufferSize
TCPSendBufSize TCPBufferSize
TCPSACKFlag bool
+ Recovery TCPLossRecovery
+ IPForwarding bool
}
// NewTestStack returns a TestStack with no network interfaces. The value of
@@ -53,6 +61,24 @@ func (s *TestStack) AddInterfaceAddr(idx int32, addr InterfaceAddr) error {
return nil
}
+// RemoveInterfaceAddr implements Stack.RemoveInterfaceAddr.
+func (s *TestStack) RemoveInterfaceAddr(idx int32, addr InterfaceAddr) error {
+ interfaceAddrs, ok := s.InterfaceAddrsMap[idx]
+ if !ok {
+ return fmt.Errorf("unknown idx: %d", idx)
+ }
+
+ var filteredAddrs []InterfaceAddr
+ for _, interfaceAddr := range interfaceAddrs {
+ if !bytes.Equal(interfaceAddr.Addr, addr.Addr) {
+			filteredAddrs = append(filteredAddrs, interfaceAddr)
+ }
+ }
+ s.InterfaceAddrsMap[idx] = filteredAddrs
+
+ return nil
+}
+
// SupportsIPv6 implements Stack.SupportsIPv6.
func (s *TestStack) SupportsIPv6() bool {
return s.SupportsIPv6Flag
@@ -91,6 +117,17 @@ func (s *TestStack) SetTCPSACKEnabled(enabled bool) error {
return nil
}
+// TCPRecovery implements Stack.TCPRecovery.
+func (s *TestStack) TCPRecovery() (TCPLossRecovery, error) {
+ return s.Recovery, nil
+}
+
+// SetTCPRecovery implements Stack.SetTCPRecovery.
+func (s *TestStack) SetTCPRecovery(recovery TCPLossRecovery) error {
+ s.Recovery = recovery
+ return nil
+}
+
// Statistics implements inet.Stack.Statistics.
func (s *TestStack) Statistics(stat interface{}, arg string) error {
return nil
@@ -116,3 +153,14 @@ func (s *TestStack) CleanupEndpoints() []stack.TransportEndpoint {
// RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints.
func (s *TestStack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {}
+
+// Forwarding implements inet.Stack.Forwarding.
+func (s *TestStack) Forwarding(protocol tcpip.NetworkProtocolNumber) bool {
+ return s.IPForwarding
+}
+
+// SetForwarding implements inet.Stack.SetForwarding.
+func (s *TestStack) SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error {
+ s.IPForwarding = enable
+ return nil
+}
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index a28eab8b8..90dd4a047 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -69,8 +69,63 @@ go_template_instance(
prefix = "socket",
template = "//pkg/ilist:generic_list",
types = {
- "Element": "*SocketEntry",
- "Linker": "*SocketEntry",
+ "Element": "*SocketRecordVFS1",
+ "Linker": "*SocketRecordVFS1",
+ },
+)
+
+go_template_instance(
+ name = "fd_table_refs",
+ out = "fd_table_refs.go",
+ package = "kernel",
+ prefix = "FDTable",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "FDTable",
+ },
+)
+
+go_template_instance(
+ name = "fs_context_refs",
+ out = "fs_context_refs.go",
+ package = "kernel",
+ prefix = "FSContext",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "FSContext",
+ },
+)
+
+go_template_instance(
+ name = "ipc_namespace_refs",
+ out = "ipc_namespace_refs.go",
+ package = "kernel",
+ prefix = "IPCNamespace",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "IPCNamespace",
+ },
+)
+
+go_template_instance(
+ name = "process_group_refs",
+ out = "process_group_refs.go",
+ package = "kernel",
+ prefix = "ProcessGroup",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "ProcessGroup",
+ },
+)
+
+go_template_instance(
+ name = "session_refs",
+ out = "session_refs.go",
+ package = "kernel",
+ prefix = "Session",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "Session",
},
)
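
Each refs_template instantiation above generates an embeddable reference counter (FDTableRefs, FSContextRefs, and so on) that replaces refs.AtomicRefCount in the wrapping type. A hand-rolled sketch of the counting scheme such a generated type provides; the real template also adds leak checking and logging, so this is an illustration only:

	package main

	import (
		"fmt"
		"sync/atomic"
	)

	// exampleRefs mimics a refs_template instantiation: one atomic
	// counter biased so that 0 means a single outstanding reference.
	type exampleRefs struct {
		refCount int64
	}

	func (r *exampleRefs) IncRef() {
		if atomic.AddInt64(&r.refCount, 1) <= 0 {
			panic("IncRef on released object")
		}
	}

	// TryIncRef fails instead of panicking if the object is already
	// being destroyed, matching how BoundEndpoint lookups use it.
	func (r *exampleRefs) TryIncRef() bool {
		for {
			v := atomic.LoadInt64(&r.refCount)
			if v < 0 {
				return false
			}
			if atomic.CompareAndSwapInt64(&r.refCount, v, v+1) {
				return true
			}
		}
	}

	// DecRef invokes destroy when the last reference is dropped.
	func (r *exampleRefs) DecRef(destroy func()) {
		switch v := atomic.AddInt64(&r.refCount, -1); {
		case v < -1:
			panic("DecRef on released object")
		case v == -1:
			destroy()
		}
	}

	func main() {
		var r exampleRefs // starts with one reference
		r.IncRef()
		r.DecRef(nil)
		r.DecRef(func() { fmt.Println("destroyed") })
	}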
@@ -85,11 +140,17 @@ go_library(
name = "kernel",
srcs = [
"abstract_socket_namespace.go",
+ "aio.go",
"context.go",
"fd_table.go",
+ "fd_table_refs.go",
"fd_table_unsafe.go",
"fs_context.go",
+ "fs_context_refs.go",
"ipc_namespace.go",
+ "ipc_namespace_refs.go",
+ "kcov.go",
+ "kcov_unsafe.go",
"kernel.go",
"kernel_opts.go",
"kernel_state.go",
@@ -98,6 +159,7 @@ go_library(
"pending_signals_state.go",
"posixtimer.go",
"process_group_list.go",
+ "process_group_refs.go",
"ptrace.go",
"ptrace_amd64.go",
"ptrace_arm64.go",
@@ -105,6 +167,7 @@ go_library(
"seccomp.go",
"seqatomic_taskgoroutineschedinfo_unsafe.go",
"session_list.go",
+ "session_refs.go",
"sessions.go",
"signal.go",
"signal_handlers.go",
@@ -131,6 +194,7 @@ go_library(
"task_stop.go",
"task_syscall.go",
"task_usermem.go",
+ "task_work.go",
"thread_group.go",
"threads.go",
"timekeeper.go",
@@ -145,22 +209,27 @@ go_library(
"gvisor.dev/gvisor/pkg/sentry/device",
"gvisor.dev/gvisor/pkg/tcpip",
],
+ marshal = True,
visibility = ["//:sandbox"],
deps = [
":uncaught_signal_go_proto",
"//pkg/abi",
"//pkg/abi/linux",
"//pkg/amutex",
- "//pkg/binary",
"//pkg/bits",
"//pkg/bpf",
+ "//pkg/cleanup",
"//pkg/context",
+ "//pkg/coverage",
"//pkg/cpuid",
"//pkg/eventchannel",
"//pkg/fspath",
"//pkg/log",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
"//pkg/metric",
"//pkg/refs",
+ "//pkg/refsvfs2",
"//pkg/safemem",
"//pkg/secio",
"//pkg/sentry/arch",
@@ -199,6 +268,7 @@ go_library(
"//pkg/sentry/vfs",
"//pkg/state",
"//pkg/state/statefile",
+ "//pkg/state/wire",
"//pkg/sync",
"//pkg/syserr",
"//pkg/syserror",
@@ -206,7 +276,6 @@ go_library(
"//pkg/tcpip/stack",
"//pkg/usermem",
"//pkg/waiter",
- "//tools/go_marshal/marshal",
],
)
diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go
index 920fe4329..0ddbe5ff6 100644
--- a/pkg/sentry/kernel/abstract_socket_namespace.go
+++ b/pkg/sentry/kernel/abstract_socket_namespace.go
@@ -15,28 +15,21 @@
package kernel
import (
+ "fmt"
"syscall"
- "gvisor.dev/gvisor/pkg/refs"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/refsvfs2"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sync"
)
// +stateify savable
type abstractEndpoint struct {
- ep transport.BoundEndpoint
- wr *refs.WeakRef
- name string
- ns *AbstractSocketNamespace
-}
-
-// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
-func (e *abstractEndpoint) WeakRefGone() {
- e.ns.mu.Lock()
- if e.ns.endpoints[e.name].ep == e.ep {
- delete(e.ns.endpoints, e.name)
- }
- e.ns.mu.Unlock()
+ ep transport.BoundEndpoint
+ socket refsvfs2.RefCounter
+ name string
+ ns *AbstractSocketNamespace
}
// AbstractSocketNamespace is used to implement the Linux abstract socket functionality.
@@ -45,7 +38,11 @@ func (e *abstractEndpoint) WeakRefGone() {
type AbstractSocketNamespace struct {
mu sync.Mutex `state:"nosave"`
- // Keeps mapping from name to endpoint.
+ // Keeps a mapping from name to endpoint. AbstractSocketNamespace does not hold
+ // any references on any sockets that it contains; when retrieving a socket,
+ // TryIncRef() must be called in case the socket is concurrently being
+ // destroyed. It is the responsibility of the socket to remove itself from the
+ // abstract socket namespace when it is destroyed.
endpoints map[string]abstractEndpoint
}
@@ -57,16 +54,16 @@ func NewAbstractSocketNamespace() *AbstractSocketNamespace {
}
// A boundEndpoint wraps a transport.BoundEndpoint to maintain a reference on
-// its backing object.
+// its backing socket.
type boundEndpoint struct {
transport.BoundEndpoint
- rc refs.RefCounter
+ socket refsvfs2.RefCounter
}
// Release implements transport.BoundEndpoint.Release.
-func (e *boundEndpoint) Release() {
- e.rc.DecRef()
- e.BoundEndpoint.Release()
+func (e *boundEndpoint) Release(ctx context.Context) {
+ e.socket.DecRef(ctx)
+ e.BoundEndpoint.Release(ctx)
}
// BoundEndpoint retrieves the endpoint bound to the given name. The return
@@ -80,32 +77,59 @@ func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndp
return nil
}
- rc := ep.wr.Get()
- if rc == nil {
- delete(a.endpoints, name)
+ if !ep.socket.TryIncRef() {
+ // The socket has reached zero references and is being destroyed.
return nil
}
- return &boundEndpoint{ep.ep, rc}
+ return &boundEndpoint{ep.ep, ep.socket}
}
// Bind binds the given socket.
//
-// When the last reference managed by rc is dropped, ep may be removed from the
+// When the last reference managed by socket is dropped, ep may be removed from the
// namespace.
-func (a *AbstractSocketNamespace) Bind(name string, ep transport.BoundEndpoint, rc refs.RefCounter) error {
+func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refsvfs2.RefCounter) error {
a.mu.Lock()
defer a.mu.Unlock()
+ // Check if there is already a socket (which has not yet been destroyed) bound at name.
if ep, ok := a.endpoints[name]; ok {
- if rc := ep.wr.Get(); rc != nil {
- rc.DecRef()
+ if ep.socket.TryIncRef() {
+ ep.socket.DecRef(ctx)
return syscall.EADDRINUSE
}
}
ae := abstractEndpoint{ep: ep, name: name, ns: a}
- ae.wr = refs.NewWeakRef(rc, &ae)
+ ae.socket = socket
a.endpoints[name] = ae
return nil
}
+
+// Remove removes the specified socket at name from the abstract socket
+// namespace, if it has not yet been replaced.
+func (a *AbstractSocketNamespace) Remove(name string, socket refsvfs2.RefCounter) {
+ a.mu.Lock()
+ defer a.mu.Unlock()
+
+ ep, ok := a.endpoints[name]
+ if !ok {
+ // We never delete a map entry apart from a socket's destructor (although the
+ // map entry may be overwritten). Therefore, a socket should exist, even if it
+ // may not be the one we expect.
+ panic(fmt.Sprintf("expected socket to exist at '%s' in abstract socket namespace", name))
+ }
+
+ // A Bind() operation may race with callers of Remove(), e.g. in the
+ // following case:
+ // socket1 reaches zero references and begins destruction
+ // a.Bind("foo", ep, socket2) replaces socket1 with socket2
+ // socket1's destructor calls a.Remove("foo", socket1)
+ //
+ // Therefore, we need to check that the socket at name is what we expect
+ // before modifying the map.
+ if ep.socket == socket {
+ delete(a.endpoints, name)
+ }
+}
diff --git a/pkg/sentry/kernel/aio.go b/pkg/sentry/kernel/aio.go
new file mode 100644
index 000000000..0ac78c0b8
--- /dev/null
+++ b/pkg/sentry/kernel/aio.go
@@ -0,0 +1,81 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "time"
+
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/log"
+)
+
+// AIOCallback is a function that does asynchronous I/O on behalf of a task.
+type AIOCallback func(context.Context)
+
+// QueueAIO queues an AIOCallback which will be run asynchronously.
+func (t *Task) QueueAIO(cb AIOCallback) {
+ ctx := taskAsyncContext{t: t}
+ wg := &t.TaskSet().aioGoroutines
+ wg.Add(1)
+ go func() {
+ cb(ctx)
+ wg.Done()
+ }()
+}
+
+type taskAsyncContext struct {
+ context.NoopSleeper
+ t *Task
+}
+
+// Debugf implements log.Logger.Debugf.
+func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) {
+ ctx.t.Debugf(format, v...)
+}
+
+// Infof implements log.Logger.Infof.
+func (ctx taskAsyncContext) Infof(format string, v ...interface{}) {
+ ctx.t.Infof(format, v...)
+}
+
+// Warningf implements log.Logger.Warningf.
+func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) {
+ ctx.t.Warningf(format, v...)
+}
+
+// IsLogging implements log.Logger.IsLogging.
+func (ctx taskAsyncContext) IsLogging(level log.Level) bool {
+ return ctx.t.IsLogging(level)
+}
+
+// Deadline implements context.Context.Deadline.
+func (ctx taskAsyncContext) Deadline() (time.Time, bool) {
+ return ctx.t.Deadline()
+}
+
+// Done implements context.Context.Done.
+func (ctx taskAsyncContext) Done() <-chan struct{} {
+ return ctx.t.Done()
+}
+
+// Err implements context.Context.Err.
+func (ctx taskAsyncContext) Err() error {
+ return ctx.t.Err()
+}
+
+// Value implements context.Context.Value.
+func (ctx taskAsyncContext) Value(key interface{}) interface{} {
+ return ctx.t.Value(key)
+}
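
QueueAIO fans the callback out to its own goroutine while taskAsyncContext keeps the task's logging and context values available; the TaskSet-level WaitGroup lets kernel teardown drain outstanding callbacks. A sketch of the calling pattern, with doIO standing in for the real I/O:

	package kernelexample

	import (
		"gvisor.dev/gvisor/pkg/context"
		"gvisor.dev/gvisor/pkg/sentry/kernel"
	)

	// queueWork queues asynchronous I/O on behalf of t. The callback runs
	// on a separate goroutine; ctx carries t's contextual values but is
	// not t's task goroutine context.
	func queueWork(t *kernel.Task, doIO func(context.Context) error) {
		t.QueueAIO(func(ctx context.Context) {
			if err := doIO(ctx); err != nil {
				ctx.Warningf("async I/O failed: %v", err)
			}
		})
	}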
diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD
index 2bc49483a..869e49ebc 100644
--- a/pkg/sentry/kernel/auth/BUILD
+++ b/pkg/sentry/kernel/auth/BUILD
@@ -57,6 +57,7 @@ go_library(
"id_map_set.go",
"user_namespace.go",
],
+ marshal = True,
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
diff --git a/pkg/sentry/kernel/auth/context.go b/pkg/sentry/kernel/auth/context.go
index ef5723127..c08d47787 100644
--- a/pkg/sentry/kernel/auth/context.go
+++ b/pkg/sentry/kernel/auth/context.go
@@ -34,3 +34,23 @@ func CredentialsFromContext(ctx context.Context) *Credentials {
}
return NewAnonymousCredentials()
}
+
+// ContextWithCredentials returns a copy of ctx carrying creds.
+func ContextWithCredentials(ctx context.Context, creds *Credentials) context.Context {
+ return &authContext{ctx, creds}
+}
+
+type authContext struct {
+ context.Context
+ creds *Credentials
+}
+
+// Value implements context.Context.
+func (ac *authContext) Value(key interface{}) interface{} {
+ switch key {
+ case CtxCredentials:
+ return ac.creds
+ default:
+ return ac.Context.Value(key)
+ }
+}
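
ContextWithCredentials layers a single value over an existing context: CredentialsFromContext sees the override while every other key falls through to the parent. A round-trip sketch:

	package authexample

	import (
		"gvisor.dev/gvisor/pkg/context"
		"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	)

	// asRoot invokes fn with root credentials attached to ctx. Inside fn,
	// auth.CredentialsFromContext returns the root credentials; all other
	// context keys still resolve against the original ctx.
	func asRoot(ctx context.Context, fn func(context.Context)) {
		creds := auth.NewRootCredentials(auth.NewRootUserNamespace())
		fn(auth.ContextWithCredentials(ctx, creds))
	}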
diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go
index e057d2c6d..6862f2ef5 100644
--- a/pkg/sentry/kernel/auth/credentials.go
+++ b/pkg/sentry/kernel/auth/credentials.go
@@ -232,3 +232,31 @@ func (c *Credentials) UseGID(gid GID) (KGID, error) {
}
return NoID, syserror.EPERM
}
+
+// SetUID translates the provided uid to the root user namespace and updates c's
+// uids to it. This performs no permissions or capabilities checks, the caller
+// is responsible for ensuring the calling context is permitted to modify c.
+func (c *Credentials) SetUID(uid UID) error {
+ kuid := c.UserNamespace.MapToKUID(uid)
+ if !kuid.Ok() {
+ return syserror.EINVAL
+ }
+ c.RealKUID = kuid
+ c.EffectiveKUID = kuid
+ c.SavedKUID = kuid
+ return nil
+}
+
+// SetGID translates the provided gid to the root user namespace and updates c's
+// gids to it. This performs no permissions or capabilities checks, the caller
+// is responsible for ensuring the calling context is permitted to modify c.
+func (c *Credentials) SetGID(gid GID) error {
+ kgid := c.UserNamespace.MapToKGID(gid)
+ if !kgid.Ok() {
+ return syserror.EINVAL
+ }
+ c.RealKGID = kgid
+ c.EffectiveKGID = kgid
+ c.SavedKGID = kgid
+ return nil
+}
diff --git a/pkg/sentry/kernel/auth/id.go b/pkg/sentry/kernel/auth/id.go
index 0a58ba17c..4c32ee703 100644
--- a/pkg/sentry/kernel/auth/id.go
+++ b/pkg/sentry/kernel/auth/id.go
@@ -19,9 +19,13 @@ import (
)
// UID is a user ID in an unspecified user namespace.
+//
+// +marshal
type UID uint32
// GID is a group ID in an unspecified user namespace.
+//
+// +marshal slice:GIDSlice
type GID uint32
// In the root user namespace, user/group IDs have a 1-to-1 relationship with
diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go
index 0c40bf315..bb94769c4 100644
--- a/pkg/sentry/kernel/context.go
+++ b/pkg/sentry/kernel/context.go
@@ -18,7 +18,6 @@ import (
"time"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/log"
)
// contextID is the kernel package's type for context.Context.Value keys.
@@ -82,7 +81,8 @@ func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace {
}
// IPCNamespaceFromContext returns the IPC namespace in which ctx is executing,
-// or nil if there is no such IPC namespace.
+// or nil if there is no such IPC namespace. It takes a reference on the
+// namespace.
func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace {
if v := ctx.Value(CtxIPCNamespace); v != nil {
return v.(*IPCNamespace)
@@ -113,55 +113,3 @@ func (*Task) Done() <-chan struct{} {
func (*Task) Err() error {
return nil
}
-
-// AsyncContext returns a context.Context that may be used by goroutines that
-// do work on behalf of t and therefore share its contextual values, but are
-// not t's task goroutine (e.g. asynchronous I/O).
-func (t *Task) AsyncContext() context.Context {
- return taskAsyncContext{t: t}
-}
-
-type taskAsyncContext struct {
- context.NoopSleeper
- t *Task
-}
-
-// Debugf implements log.Logger.Debugf.
-func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) {
- ctx.t.Debugf(format, v...)
-}
-
-// Infof implements log.Logger.Infof.
-func (ctx taskAsyncContext) Infof(format string, v ...interface{}) {
- ctx.t.Infof(format, v...)
-}
-
-// Warningf implements log.Logger.Warningf.
-func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) {
- ctx.t.Warningf(format, v...)
-}
-
-// IsLogging implements log.Logger.IsLogging.
-func (ctx taskAsyncContext) IsLogging(level log.Level) bool {
- return ctx.t.IsLogging(level)
-}
-
-// Deadline implements context.Context.Deadline.
-func (ctx taskAsyncContext) Deadline() (time.Time, bool) {
- return ctx.t.Deadline()
-}
-
-// Done implements context.Context.Done.
-func (ctx taskAsyncContext) Done() <-chan struct{} {
- return ctx.t.Done()
-}
-
-// Err implements context.Context.Err.
-func (ctx taskAsyncContext) Err() error {
- return ctx.t.Err()
-}
-
-// Value implements context.Context.Value.
-func (ctx taskAsyncContext) Value(key interface{}) interface{} {
- return ctx.t.Value(key)
-}
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go
index 3d78cd48f..15519f0df 100644
--- a/pkg/sentry/kernel/epoll/epoll.go
+++ b/pkg/sentry/kernel/epoll/epoll.go
@@ -76,8 +76,8 @@ type pollEntry struct {
// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
// weakReferenceGone is called when the file in the weak reference is destroyed.
// The poll entry is removed in response to this.
-func (p *pollEntry) WeakRefGone() {
- p.epoll.RemoveEntry(p.id)
+func (p *pollEntry) WeakRefGone(ctx context.Context) {
+ p.epoll.RemoveEntry(ctx, p.id)
}
// EventPoll holds all the state associated with an event poll object, that is,
@@ -107,7 +107,7 @@ type EventPoll struct {
// different lock to avoid circular lock acquisition order involving
// the wait queue mutexes and mu. The full order is mu, observed file
// wait queue mutex, then listsMu; this allows listsMu to be acquired
- // when readyCallback is called.
+ // when (*pollEntry).Callback is called.
//
// An entry is always in one of the following lists:
// readyList -- when there's a chance that it's ready to have
@@ -116,7 +116,7 @@ type EventPoll struct {
// readEvents() functions always call the entry's file
// Readiness() function to confirm it's ready.
// waitingList -- when there's no chance that the entry is ready,
- // so it's waiting for the readyCallback to be called
+ // so it's waiting for the (*pollEntry).Callback to be called
// on it before it gets moved to the readyList.
// disabledList -- when the entry is disabled. This happens when
// a one-shot entry gets delivered via readEvents().
@@ -144,14 +144,14 @@ func NewEventPoll(ctx context.Context) *fs.File {
// name matches fs/eventpoll.c:epoll_create1.
dirent := fs.NewDirent(ctx, anon.NewInode(ctx), fmt.Sprintf("anon_inode:[eventpoll]"))
// Release the initial dirent reference after NewFile takes a reference.
- defer dirent.DecRef()
+ defer dirent.DecRef(ctx)
return fs.NewFile(ctx, dirent, fs.FileFlags{}, &EventPoll{
files: make(map[FileIdentifier]*pollEntry),
})
}
// Release implements fs.FileOperations.Release.
-func (e *EventPoll) Release() {
+func (e *EventPoll) Release(ctx context.Context) {
// We need to take the lock now because files may be attempting to
// remove entries in parallel if they get destroyed.
e.mu.Lock()
@@ -160,7 +160,7 @@ func (e *EventPoll) Release() {
// Go through all entries and clean up.
for _, entry := range e.files {
entry.id.File.EventUnregister(&entry.waiter)
- entry.file.Drop()
+ entry.file.Drop(ctx)
}
e.files = nil
}
@@ -269,21 +269,19 @@ func (e *EventPoll) ReadEvents(max int) []linux.EpollEvent {
return ret
}
-// readyCallback is called when one of the files we're polling becomes ready. It
-// moves said file to the readyList if it's currently in the waiting list.
-type readyCallback struct{}
-
// Callback implements waiter.EntryCallback.Callback.
-func (*readyCallback) Callback(w *waiter.Entry) {
- entry := w.Context.(*pollEntry)
- e := entry.epoll
+//
+// Callback is called when one of the files we're polling becomes ready. It
+// moves said file to the readyList if it's currently in the waiting list.
+func (p *pollEntry) Callback(*waiter.Entry) {
+ e := p.epoll
e.listsMu.Lock()
- if entry.curList == &e.waitingList {
- e.waitingList.Remove(entry)
- e.readyList.PushBack(entry)
- entry.curList = &e.readyList
+ if p.curList == &e.waitingList {
+ e.waitingList.Remove(p)
+ e.readyList.PushBack(p)
+ p.curList = &e.readyList
e.listsMu.Unlock()
e.Notify(waiter.EventIn)
@@ -310,7 +308,7 @@ func (e *EventPoll) initEntryReadiness(entry *pollEntry) {
// Check if the file happens to already be in a ready state.
ready := f.Readiness(entry.mask) & entry.mask
if ready != 0 {
- (*readyCallback).Callback(nil, &entry.waiter)
+ entry.Callback(&entry.waiter)
}
}
@@ -380,10 +378,9 @@ func (e *EventPoll) AddEntry(id FileIdentifier, flags EntryFlags, mask waiter.Ev
userData: data,
epoll: e,
flags: flags,
- waiter: waiter.Entry{Callback: &readyCallback{}},
mask: mask,
}
- entry.waiter.Context = entry
+ entry.waiter.Callback = entry
e.files[id] = entry
entry.file = refs.NewWeakRef(id.File, entry)
@@ -406,7 +403,7 @@ func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter
}
// Unregister the old mask and remove entry from the list it's in, so
- // readyCallback is guaranteed to not be called on this entry anymore.
+ // (*pollEntry).Callback is guaranteed to not be called on this entry anymore.
entry.id.File.EventUnregister(&entry.waiter)
	// Remove entry from whatever list it's in. This ensures that no other
@@ -426,7 +423,7 @@ func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter
}
// RemoveEntry removes a file from the collection of observed files.
-func (e *EventPoll) RemoveEntry(id FileIdentifier) error {
+func (e *EventPoll) RemoveEntry(ctx context.Context, id FileIdentifier) error {
e.mu.Lock()
defer e.mu.Unlock()
@@ -448,7 +445,7 @@ func (e *EventPoll) RemoveEntry(id FileIdentifier) error {
// Remove file from map, and drop weak reference.
delete(e.files, id)
- entry.file.Drop()
+ entry.file.Drop(ctx)
return nil
}
diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go
index 8e9f200d0..7c61e0258 100644
--- a/pkg/sentry/kernel/epoll/epoll_state.go
+++ b/pkg/sentry/kernel/epoll/epoll_state.go
@@ -21,8 +21,7 @@ import (
// afterLoad is invoked by stateify.
func (p *pollEntry) afterLoad() {
- p.waiter = waiter.Entry{Callback: &readyCallback{}}
- p.waiter.Context = p
+ p.waiter.Callback = p
p.file = refs.NewWeakRef(p.id.File, p)
p.id.File.EventRegister(&p.waiter, p.mask)
}
diff --git a/pkg/sentry/kernel/epoll/epoll_test.go b/pkg/sentry/kernel/epoll/epoll_test.go
index 22630e9c5..55b505593 100644
--- a/pkg/sentry/kernel/epoll/epoll_test.go
+++ b/pkg/sentry/kernel/epoll/epoll_test.go
@@ -26,7 +26,8 @@ func TestFileDestroyed(t *testing.T) {
f := filetest.NewTestFile(t)
id := FileIdentifier{f, 12}
- efile := NewEventPoll(contexttest.Context(t))
+ ctx := contexttest.Context(t)
+ efile := NewEventPoll(ctx)
e := efile.FileOperations.(*EventPoll)
if err := e.AddEntry(id, 0, waiter.EventIn, [2]int32{}); err != nil {
t.Fatalf("addEntry failed: %v", err)
@@ -44,7 +45,7 @@ func TestFileDestroyed(t *testing.T) {
}
// Destroy the file. Check that we get no more events.
- f.DecRef()
+ f.DecRef(ctx)
evt = e.ReadEvents(1)
if len(evt) != 0 {
diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go
index 87951adeb..bbf568dfc 100644
--- a/pkg/sentry/kernel/eventfd/eventfd.go
+++ b/pkg/sentry/kernel/eventfd/eventfd.go
@@ -70,7 +70,7 @@ func New(ctx context.Context, initVal uint64, semMode bool) *fs.File {
// name matches fs/eventfd.c:eventfd_file_create.
dirent := fs.NewDirent(ctx, anon.NewInode(ctx), "anon_inode:[eventfd]")
// Release the initial dirent reference after NewFile takes a reference.
- defer dirent.DecRef()
+ defer dirent.DecRef(ctx)
return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &EventOperations{
val: initVal,
semMode: semMode,
@@ -106,7 +106,7 @@ func (e *EventOperations) HostFD() (int, error) {
}
// Release implements fs.FileOperations.Release.
-func (e *EventOperations) Release() {
+func (e *EventOperations) Release(context.Context) {
e.mu.Lock()
defer e.mu.Unlock()
if e.hostfd >= 0 {
diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD
index b9126e946..2b3955598 100644
--- a/pkg/sentry/kernel/fasync/BUILD
+++ b/pkg/sentry/kernel/fasync/BUILD
@@ -11,6 +11,7 @@ go_library(
"//pkg/sentry/fs",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
+ "//pkg/sentry/vfs",
"//pkg/sync",
"//pkg/waiter",
],
diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go
index d32c3e90a..153d2cd9b 100644
--- a/pkg/sentry/kernel/fasync/fasync.go
+++ b/pkg/sentry/kernel/fasync/fasync.go
@@ -20,15 +20,21 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/waiter"
)
-// New creates a new FileAsync.
+// New creates a new fs.FileAsync.
func New() fs.FileAsync {
return &FileAsync{}
}
+// NewVFS2 creates a new vfs.FileAsync.
+func NewVFS2() vfs.FileAsync {
+ return &FileAsync{}
+}
+
// FileAsync sends signals when the registered file is ready for IO.
//
// +stateify savable
@@ -170,3 +176,13 @@ func (a *FileAsync) SetOwnerProcessGroup(requester *kernel.Task, recipient *kern
a.recipientTG = nil
a.recipientPG = recipient
}
+
+// ClearOwner unsets the current signal recipient.
+func (a *FileAsync) ClearOwner() {
+ a.mu.Lock()
+ defer a.mu.Unlock()
+ a.requester = nil
+ a.recipientT = nil
+ a.recipientTG = nil
+ a.recipientPG = nil
+}
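ClearOwner complements the SetOwner* family. A plausible caller is fcntl
F_SETOWN with pid 0, which must stop SIGIO/SIGURG delivery entirely; the
helper below is a hypothetical sketch, not part of this change:

    // setOwner clears or sets the async-IO signal recipient.
    func setOwner(t *kernel.Task, a *fasync.FileAsync, pid int32) {
        if pid == 0 {
            // No recipient: stop delivering signals for this file.
            a.ClearOwner()
            return
        }
        // Resolve pid to a task or process group and call the matching
        // SetOwnerTask/SetOwnerProcessGroup method (elided).
    }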
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index dbfcef0fa..7aba31587 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -23,12 +23,12 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
)
// FDFlags define flags for an individual descriptor.
@@ -77,11 +77,9 @@ type descriptor struct {
//
// +stateify savable
type FDTable struct {
- refs.AtomicRefCount
- k *Kernel
+ FDTableRefs
- // uid is a unique identifier.
- uid uint64
+ k *Kernel
// mu protects below.
mu sync.Mutex `state:"nosave"`
@@ -100,7 +98,7 @@ type FDTable struct {
func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
m := make(map[int32]descriptor)
- f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+ f.forEach(context.Background(), func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
m[fd] = descriptor{
file: file,
fileVFS2: fileVFS2,
@@ -111,26 +109,30 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
}
func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
- f.init() // Initialize table.
+ ctx := context.Background()
+ f.initNoLeakCheck() // Initialize table.
+ f.used = 0
for fd, d := range m {
- f.setAll(fd, d.file, d.fileVFS2, d.flags)
+ if file, fileVFS2 := f.setAll(ctx, fd, d.file, d.fileVFS2, d.flags); file != nil || fileVFS2 != nil {
+ panic("VFS1 or VFS2 files set")
+ }
// Note that we do _not_ need to acquire an extra table reference here. The
// table reference will already be accounted for in the file, so we drop the
// reference taken by set above.
switch {
case d.file != nil:
- d.file.DecRef()
+ d.file.DecRef(ctx)
case d.fileVFS2 != nil:
- d.fileVFS2.DecRef()
+ d.fileVFS2.DecRef(ctx)
}
}
}
// drop drops the table reference.
-func (f *FDTable) drop(file *fs.File) {
+func (f *FDTable) drop(ctx context.Context, file *fs.File) {
// Release locks.
- file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lock.UniqueID(f.uid), lock.LockRange{0, lock.LockEOF})
+ file.Dirent.Inode.LockCtx.Posix.UnlockRegion(f, lock.LockRange{0, lock.LockEOF})
// Send inotify events.
d := file.Dirent
@@ -146,61 +148,51 @@ func (f *FDTable) drop(file *fs.File) {
d.InotifyEvent(ev, 0)
// Drop the table reference.
- file.DecRef()
+ file.DecRef(ctx)
}
// dropVFS2 drops the table reference.
-func (f *FDTable) dropVFS2(file *vfs.FileDescription) {
- // TODO(gvisor.dev/issue/1480): Release locks.
+func (f *FDTable) dropVFS2(ctx context.Context, file *vfs.FileDescription) {
+ // Release any POSIX lock possibly held by the FDTable. Range {0, 0} means the
+ // entire file.
+ err := file.UnlockPOSIX(ctx, f, 0, 0, linux.SEEK_SET)
+ if err != nil && err != syserror.ENOLCK {
+ panic(fmt.Sprintf("UnlockPOSIX failed: %v", err))
+ }
// Generate inotify events.
ev := uint32(linux.IN_CLOSE_NOWRITE)
if file.IsWritable() {
ev = linux.IN_CLOSE_WRITE
}
- file.Dentry().InotifyWithParent(ev, 0, vfs.PathEvent)
+ file.Dentry().InotifyWithParent(ctx, ev, 0, vfs.PathEvent)
- // Drop the table reference.
- file.DecRef()
-}
-
-// ID returns a unique identifier for this FDTable.
-func (f *FDTable) ID() uint64 {
- return f.uid
+ // Drop the table's reference.
+ file.DecRef(ctx)
}
// NewFDTable allocates a new FDTable that may be used by tasks in k.
func (k *Kernel) NewFDTable() *FDTable {
- f := &FDTable{
- k: k,
- uid: atomic.AddUint64(&k.fdMapUids, 1),
- }
+ f := &FDTable{k: k}
f.init()
return f
}
-// destroy removes all of the file descriptors from the map.
-func (f *FDTable) destroy() {
- f.RemoveIf(func(*fs.File, *vfs.FileDescription, FDFlags) bool {
- return true
+// DecRef implements RefCounter.DecRef.
+//
+// If f reaches zero references, all of its file descriptors are removed.
+func (f *FDTable) DecRef(ctx context.Context) {
+ f.FDTableRefs.DecRef(func() {
+ f.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, FDFlags) bool {
+ return true
+ })
})
}
-// DecRef implements RefCounter.DecRef with destructor f.destroy.
-func (f *FDTable) DecRef() {
- f.DecRefWithDestructor(f.destroy)
-}
-
-// Size returns the number of file descriptor slots currently allocated.
-func (f *FDTable) Size() int {
- size := atomic.LoadInt32(&f.used)
- return int(size)
-}
-
// forEach iterates over all non-nil files in sorted order.
//
// It is the caller's responsibility to acquire an appropriate lock.
-func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) {
+func (f *FDTable) forEach(ctx context.Context, fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) {
// retries tracks the number of failed TryIncRef attempts for the same FD.
retries := 0
fd := int32(0)
@@ -219,7 +211,7 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes
continue // Race caught.
}
fn(fd, file, nil, flags)
- file.DecRef()
+ file.DecRef(ctx)
case fileVFS2 != nil:
if !fileVFS2.TryIncRef() {
retries++
@@ -229,7 +221,7 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes
continue // Race caught.
}
fn(fd, nil, fileVFS2, flags)
- fileVFS2.DecRef()
+ fileVFS2.DecRef(ctx)
}
retries = 0
fd++
@@ -239,7 +231,8 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes
// String is a stringer for FDTable.
func (f *FDTable) String() string {
var buf strings.Builder
- f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+ ctx := context.Background()
+ f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
switch {
case file != nil:
n, _ := file.Dirent.FullName(nil /* root */)
@@ -247,7 +240,11 @@ func (f *FDTable) String() string {
case fileVFS2 != nil:
vfsObj := fileVFS2.Mount().Filesystem().VirtualFilesystem()
- name, err := vfsObj.PathnameWithDeleted(context.Background(), vfs.VirtualDentry{}, fileVFS2.VirtualDentry())
+ vd := fileVFS2.VirtualDentry()
+ if vd.Dentry() == nil {
+ panic(fmt.Sprintf("fd %d (type %T) has nil dentry: %#v", fd, fileVFS2.Impl(), fileVFS2))
+ }
+ name, err := vfsObj.PathnameWithDeleted(ctx, vfs.VirtualDentry{}, fileVFS2.VirtualDentry())
if err != nil {
fmt.Fprintf(&buf, "<err: %v>\n", err)
return
@@ -282,7 +279,6 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
}
f.mu.Lock()
- defer f.mu.Unlock()
// From f.next to find available fd.
if fd < f.next {
@@ -292,15 +288,25 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
// Install all entries.
for i := fd; i < end && len(fds) < len(files); i++ {
if d, _, _ := f.get(i); d == nil {
- f.set(i, files[len(fds)], flags) // Set the descriptor.
- fds = append(fds, i) // Record the file descriptor.
+ // Set the descriptor.
+ f.set(ctx, i, files[len(fds)], flags)
+ fds = append(fds, i) // Record the file descriptor.
}
}
// Failure? Unwind existing FDs.
if len(fds) < len(files) {
for _, i := range fds {
- f.set(i, nil, FDFlags{}) // Zap entry.
+ f.set(ctx, i, nil, FDFlags{})
+ }
+ f.mu.Unlock()
+
+ // Drop the reference taken by the call to f.set() that
+ // originally installed the file. Don't call f.drop()
+ // (generating inotify events, etc.) since the file should
+ // appear to have never been inserted into f.
+ for _, file := range files[:len(fds)] {
+ file.DecRef(ctx)
}
return nil, syscall.EMFILE
}
@@ -310,6 +316,7 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
f.next = fds[len(fds)-1] + 1
}
+ f.mu.Unlock()
return fds, nil
}
@@ -337,7 +344,6 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes
}
f.mu.Lock()
- defer f.mu.Unlock()
// From f.next to find available fd.
if fd < f.next {
@@ -347,15 +353,25 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes
// Install all entries.
for i := fd; i < end && len(fds) < len(files); i++ {
if d, _, _ := f.getVFS2(i); d == nil {
- f.setVFS2(i, files[len(fds)], flags) // Set the descriptor.
- fds = append(fds, i) // Record the file descriptor.
+ // Set the descriptor.
+ f.setVFS2(ctx, i, files[len(fds)], flags)
+ fds = append(fds, i) // Record the file descriptor.
}
}
// Failure? Unwind existing FDs.
if len(fds) < len(files) {
for _, i := range fds {
- f.setVFS2(i, nil, FDFlags{}) // Zap entry.
+ f.setVFS2(ctx, i, nil, FDFlags{})
+ }
+ f.mu.Unlock()
+
+ // Drop the reference taken by the call to f.setVFS2() that
+ // originally installed the file. Don't call f.dropVFS2()
+ // (generating inotify events, etc.) since the file should
+ // appear to have never been inserted into f.
+ for _, file := range files[:len(fds)] {
+ file.DecRef(ctx)
}
return nil, syscall.EMFILE
}
@@ -365,6 +381,7 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes
f.next = fds[len(fds)-1] + 1
}
+ f.mu.Unlock()
return fds, nil
}
@@ -400,7 +417,7 @@ func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDesc
}
for fd < end {
if d, _, _ := f.getVFS2(fd); d == nil {
- f.setVFS2(fd, file, flags)
+ f.setVFS2(ctx, fd, file, flags)
if fd == f.next {
// Update next search start position.
f.next = fd + 1
@@ -416,40 +433,55 @@ func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDesc
// reference for that FD, the ref count for that existing reference is
// decremented.
func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error {
- return f.newFDAt(ctx, fd, file, nil, flags)
+ df, _, err := f.newFDAt(ctx, fd, file, nil, flags)
+ if err != nil {
+ return err
+ }
+ if df != nil {
+ f.drop(ctx, df)
+ }
+ return nil
}
// NewFDAtVFS2 sets the file reference for the given FD. If there is an active
// reference for that FD, the ref count for that existing reference is
// decremented.
func (f *FDTable) NewFDAtVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) error {
- return f.newFDAt(ctx, fd, nil, file, flags)
+ _, dfVFS2, err := f.newFDAt(ctx, fd, nil, file, flags)
+ if err != nil {
+ return err
+ }
+ if dfVFS2 != nil {
+ f.dropVFS2(ctx, dfVFS2)
+ }
+ return nil
}
-func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) error {
+func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) (*fs.File, *vfs.FileDescription, error) {
if fd < 0 {
// Don't accept negative FDs.
- return syscall.EBADF
+ return nil, nil, syscall.EBADF
}
// Check the limit for the provided file.
if limitSet := limits.FromContext(ctx); limitSet != nil {
if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur {
- return syscall.EMFILE
+ return nil, nil, syscall.EMFILE
}
}
// Install the entry.
f.mu.Lock()
defer f.mu.Unlock()
- f.setAll(fd, file, fileVFS2, flags)
- return nil
+
+ df, dfVFS2 := f.setAll(ctx, fd, file, fileVFS2, flags)
+ return df, dfVFS2, nil
}
// SetFlags sets the flags for the given file descriptor.
//
// An error is returned if the FD is invalid or unused.
-func (f *FDTable) SetFlags(fd int32, flags FDFlags) error {
+func (f *FDTable) SetFlags(ctx context.Context, fd int32, flags FDFlags) error {
if fd < 0 {
// Don't accept negative FDs.
return syscall.EBADF
@@ -465,7 +497,30 @@ func (f *FDTable) SetFlags(fd int32, flags FDFlags) error {
}
// Update the flags.
- f.set(fd, file, flags)
+ f.set(ctx, fd, file, flags)
+ return nil
+}
+
+// SetFlagsVFS2 sets the flags for the given file descriptor.
+//
+// An error is returned if the FD is invalid or unused.
+func (f *FDTable) SetFlagsVFS2(ctx context.Context, fd int32, flags FDFlags) error {
+ if fd < 0 {
+ // Don't accept negative FDs.
+ return syscall.EBADF
+ }
+
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ file, _, _ := f.getVFS2(fd)
+ if file == nil {
+ // No file found.
+ return syscall.EBADF
+ }
+
+ // Update the flags.
+ f.setVFS2(ctx, fd, file, flags)
return nil
}
@@ -523,50 +578,23 @@ func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) {
//
// Precondition: The caller must be running on the task goroutine, or Task.mu
// must be locked.
-func (f *FDTable) GetFDs() []int32 {
+func (f *FDTable) GetFDs(ctx context.Context) []int32 {
fds := make([]int32, 0, int(atomic.LoadInt32(&f.used)))
- f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+ f.forEach(ctx, func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) {
fds = append(fds, fd)
})
return fds
}
-// GetRefs returns a stable slice of references to all files and bumps the
-// reference count on each. The caller must use DecRef on each reference when
-// they're done using the slice.
-func (f *FDTable) GetRefs() []*fs.File {
- files := make([]*fs.File, 0, f.Size())
- f.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
- file.IncRef() // Acquire a reference for caller.
- files = append(files, file)
- })
- return files
-}
-
-// GetRefsVFS2 returns a stable slice of references to all files and bumps the
-// reference count on each. The caller must use DecRef on each reference when
-// they're done using the slice.
-func (f *FDTable) GetRefsVFS2() []*vfs.FileDescription {
- files := make([]*vfs.FileDescription, 0, f.Size())
- f.forEach(func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) {
- file.IncRef() // Acquire a reference for caller.
- files = append(files, file)
- })
- return files
-}
-
// Fork returns an independent FDTable.
-func (f *FDTable) Fork() *FDTable {
+func (f *FDTable) Fork(ctx context.Context) *FDTable {
clone := f.k.NewFDTable()
- f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+ f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
// The set function here will acquire an appropriate table
// reference for the clone. We don't need anything else.
- switch {
- case file != nil:
- clone.set(fd, file, flags)
- case fileVFS2 != nil:
- clone.setVFS2(fd, fileVFS2, flags)
+ if df, dfVFS2 := clone.setAll(ctx, fd, file, fileVFS2, flags); df != nil || dfVFS2 != nil {
+ panic("VFS1 or VFS2 files set")
}
})
return clone
@@ -575,13 +603,12 @@ func (f *FDTable) Fork() *FDTable {
// Remove removes an FD from the table and returns the file (if any) that was removed.
//
// N.B. Callers are required to use DecRef when they are done.
-func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) {
+func (f *FDTable) Remove(ctx context.Context, fd int32) (*fs.File, *vfs.FileDescription) {
if fd < 0 {
return nil, nil
}
f.mu.Lock()
- defer f.mu.Unlock()
// Update current available position.
if fd < f.next {
@@ -597,24 +624,51 @@ func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) {
case orig2 != nil:
orig2.IncRef()
}
+
if orig != nil || orig2 != nil {
- f.setAll(fd, nil, nil, FDFlags{}) // Zap entry.
+ orig, orig2 = f.setAll(ctx, fd, nil, nil, FDFlags{}) // Zap entry.
}
+ f.mu.Unlock()
+
+ if orig != nil {
+ f.drop(ctx, orig)
+ }
+ if orig2 != nil {
+ f.dropVFS2(ctx, orig2)
+ }
+
return orig, orig2
}
// RemoveIf removes all FDs where cond is true.
-func (f *FDTable) RemoveIf(cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) {
- f.mu.Lock()
- defer f.mu.Unlock()
+func (f *FDTable) RemoveIf(ctx context.Context, cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) {
+ // TODO(gvisor.dev/issue/1624): Remove fs.File slice.
+ var files []*fs.File
+ var filesVFS2 []*vfs.FileDescription
- f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+ f.mu.Lock()
+ f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
if cond(file, fileVFS2, flags) {
- f.set(fd, nil, FDFlags{}) // Clear from table.
+ df, dfVFS2 := f.setAll(ctx, fd, nil, nil, FDFlags{}) // Clear from table.
+ if df != nil {
+ files = append(files, df)
+ }
+ if dfVFS2 != nil {
+ filesVFS2 = append(filesVFS2, dfVFS2)
+ }
// Update current available position.
if fd < f.next {
f.next = fd
}
}
})
+ f.mu.Unlock()
+
+ for _, file := range files {
+ f.drop(ctx, file)
+ }
+
+ for _, file := range filesVFS2 {
+ f.dropVFS2(ctx, file)
+ }
}
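The methods above (NewFDs, NewFDsVFS2, NewFDAt*, Remove, RemoveIf) share one
restructuring: f.mu is now released before drop/dropVFS2 runs, because
dropping emits inotify events and releases POSIX locks, which must not happen
under the table lock. The recurring shape, as a sketch built from the APIs in
this file (removeAllMatching is a hypothetical helper):

    func removeAllMatching(ctx context.Context, f *FDTable, cond func(*fs.File) bool) {
        var dropped []*fs.File
        f.mu.Lock()
        f.forEach(ctx, func(fd int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
            if file != nil && cond(file) {
                // setAll returns the displaced file with the table's
                // reference transferred to us.
                if df, _ := f.setAll(ctx, fd, nil, nil, FDFlags{}); df != nil {
                    dropped = append(dropped, df)
                }
            }
        })
        f.mu.Unlock()
        for _, file := range dropped {
            f.drop(ctx, file) // side effects happen outside f.mu
        }
    }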
diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go
index 29f95a2c4..bf5460083 100644
--- a/pkg/sentry/kernel/fd_table_test.go
+++ b/pkg/sentry/kernel/fd_table_test.go
@@ -72,7 +72,7 @@ func TestFDTableMany(t *testing.T) {
}
i := int32(2)
- fdTable.Remove(i)
+ fdTable.Remove(ctx, i)
if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil || fds[0] != i {
t.Fatalf("Allocated %v FDs but wanted to allocate %v: %v", i, maxFD, err)
}
@@ -93,7 +93,7 @@ func TestFDTableOverLimit(t *testing.T) {
t.Fatalf("fdTable.NewFDs(maxFD-3, {f,f,f}): got %v, wanted nil", err)
} else {
for _, fd := range fds {
- fdTable.Remove(fd)
+ fdTable.Remove(ctx, fd)
}
}
@@ -150,13 +150,13 @@ func TestFDTable(t *testing.T) {
t.Fatalf("fdTable.Get(2): got a %v, wanted nil", ref)
}
- ref, _ := fdTable.Remove(1)
+ ref, _ := fdTable.Remove(ctx, 1)
if ref == nil {
t.Fatalf("fdTable.Remove(1) for an existing FD: failed, want success")
}
- ref.DecRef()
+ ref.DecRef(ctx)
- if ref, _ := fdTable.Remove(1); ref != nil {
+ if ref, _ := fdTable.Remove(ctx, 1); ref != nil {
t.Fatalf("r.Remove(1) for a removed FD: got success, want failure")
}
})
@@ -191,7 +191,7 @@ func BenchmarkFDLookupAndDecRef(b *testing.B) {
b.StartTimer() // Benchmark.
for i := 0; i < b.N; i++ {
tf, _ := fdTable.Get(fds[i%len(fds)])
- tf.DecRef()
+ tf.DecRef(ctx)
}
})
}
@@ -219,7 +219,7 @@ func BenchmarkFDLookupAndDecRefConcurrent(b *testing.B) {
defer wg.Done()
for i := 0; i < each; i++ {
tf, _ := fdTable.Get(fds[i%len(fds)])
- tf.DecRef()
+ tf.DecRef(ctx)
}
}()
}
diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go
index 7fd97dc53..3476551f3 100644
--- a/pkg/sentry/kernel/fd_table_unsafe.go
+++ b/pkg/sentry/kernel/fd_table_unsafe.go
@@ -18,6 +18,7 @@ import (
"sync/atomic"
"unsafe"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/vfs"
)
@@ -30,12 +31,21 @@ type descriptorTable struct {
slice unsafe.Pointer `state:".(map[int32]*descriptor)"`
}
-// init initializes the table.
-func (f *FDTable) init() {
+// initNoLeakCheck initializes the table without enabling leak checking.
+//
+// This is used when loading an FDTable after S/R, during which the ref count
+// object itself will enable leak checking if necessary.
+func (f *FDTable) initNoLeakCheck() {
var slice []unsafe.Pointer // Empty slice.
atomic.StorePointer(&f.slice, unsafe.Pointer(&slice))
}
+// init initializes the table with leak checking.
+func (f *FDTable) init() {
+ f.initNoLeakCheck()
+ f.EnableLeakCheck()
+}
+
// get gets a file entry.
//
// The boolean indicates whether this was in range.
@@ -76,33 +86,37 @@ func (f *FDTable) getAll(fd int32) (*fs.File, *vfs.FileDescription, FDFlags, boo
return d.file, d.fileVFS2, d.flags, true
}
-// set sets an entry.
-//
-// This handles accounting changes, as well as acquiring and releasing the
-// reference needed by the table iff the file is different.
+// CurrentMaxFDs returns the number of file descriptors that may be stored in f
+// without reallocation.
+func (f *FDTable) CurrentMaxFDs() int {
+ slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice))
+ return len(slice)
+}
+
+// set sets an entry for VFS1; see setAll().
//
// Precondition: mu must be held.
-func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) {
- f.setAll(fd, file, nil, flags)
+func (f *FDTable) set(ctx context.Context, fd int32, file *fs.File, flags FDFlags) *fs.File {
+ dropFile, _ := f.setAll(ctx, fd, file, nil, flags)
+ return dropFile
}
-// setVFS2 sets an entry.
-//
-// This handles accounting changes, as well as acquiring and releasing the
-// reference needed by the table iff the file is different.
+// setVFS2 sets an entry for VFS2; see setAll().
//
// Precondition: mu must be held.
-func (f *FDTable) setVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) {
- f.setAll(fd, nil, file, flags)
+func (f *FDTable) setVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) *vfs.FileDescription {
+ _, dropFile := f.setAll(ctx, fd, nil, file, flags)
+ return dropFile
}
-// setAll sets an entry.
-//
-// This handles accounting changes, as well as acquiring and releasing the
-// reference needed by the table iff the file is different.
+// setAll sets the file description referred to by fd to file/fileVFS2. If
+// file/fileVFS2 are non-nil, it takes a reference on them. If setAll replaces
+// an existing file description, it returns it with the FDTable's reference
+// transferred to the caller, which must call f.drop/dropVFS2() on the returned
+// file after unlocking f.mu.
//
// Precondition: mu must be held.
-func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+func (f *FDTable) setAll(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) (*fs.File, *vfs.FileDescription) {
if file != nil && fileVFS2 != nil {
panic("VFS1 and VFS2 files set")
}
@@ -145,25 +159,25 @@ func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription,
}
}
- // Drop the table reference.
+ // Adjust used.
+ switch {
+ case orig == nil && desc != nil:
+ atomic.AddInt32(&f.used, 1)
+ case orig != nil && desc == nil:
+ atomic.AddInt32(&f.used, -1)
+ }
+
if orig != nil {
switch {
case orig.file != nil:
if desc == nil || desc.file != orig.file {
- f.drop(orig.file)
+ return orig.file, nil
}
case orig.fileVFS2 != nil:
if desc == nil || desc.fileVFS2 != orig.fileVFS2 {
- f.dropVFS2(orig.fileVFS2)
+ return nil, orig.fileVFS2
}
}
}
-
- // Adjust used.
- switch {
- case orig == nil && desc != nil:
- atomic.AddInt32(&f.used, 1)
- case orig != nil && desc == nil:
- atomic.AddInt32(&f.used, -1)
- }
+ return nil, nil
}
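Under setAll's new contract, the caller that mutates the table owns the
displaced file's table reference and must drop it only after releasing f.mu.
A sketch of the caller pattern the comment above prescribes (installFD is
hypothetical):

    func installFD(ctx context.Context, f *FDTable, fd int32, file *fs.File, flags FDFlags) {
        f.mu.Lock()
        df, dfVFS2 := f.setAll(ctx, fd, file, nil, flags)
        f.mu.Unlock()
        if df != nil {
            f.drop(ctx, df) // release the table's reference to the old VFS1 file
        }
        if dfVFS2 != nil {
            f.dropVFS2(ctx, dfVFS2)
        }
    }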
diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go
index 47f78df9a..41fb2a784 100644
--- a/pkg/sentry/kernel/fs_context.go
+++ b/pkg/sentry/kernel/fs_context.go
@@ -17,7 +17,7 @@ package kernel
import (
"fmt"
- "gvisor.dev/gvisor/pkg/refs"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
@@ -29,7 +29,7 @@ import (
//
// +stateify savable
type FSContext struct {
- refs.AtomicRefCount
+ FSContextRefs
// mu protects below.
mu sync.Mutex `state:"nosave"`
@@ -63,7 +63,7 @@ func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext {
cwd: cwd,
umask: umask,
}
- f.EnableLeakCheck("kernel.FSContext")
+ f.EnableLeakCheck()
return &f
}
@@ -76,96 +76,104 @@ func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext {
cwdVFS2: cwd,
umask: umask,
}
- f.EnableLeakCheck("kernel.FSContext")
+ f.EnableLeakCheck()
return &f
}
-// destroy is the destructor for an FSContext.
+// DecRef implements RefCounter.DecRef.
//
-// This will call DecRef on both root and cwd Dirents. If either call to
-// DecRef returns an error, then it will be propagated. If both calls to
-// DecRef return an error, then the one from root.DecRef will be propagated.
+// When f reaches zero references, DecRef will be called on both root and cwd
+// Dirents.
//
// Note that there may still be calls to WorkingDirectory() or RootDirectory()
// (that return nil). This is because valid references may still be held via
// proc files or other mechanisms.
-func (f *FSContext) destroy() {
- // Hold f.mu so that we don't race with RootDirectory() and
- // WorkingDirectory().
- f.mu.Lock()
- defer f.mu.Unlock()
-
- if VFS2Enabled {
- f.rootVFS2.DecRef()
- f.rootVFS2 = vfs.VirtualDentry{}
- f.cwdVFS2.DecRef()
- f.cwdVFS2 = vfs.VirtualDentry{}
- } else {
- f.root.DecRef()
- f.root = nil
- f.cwd.DecRef()
- f.cwd = nil
- }
-}
-
-// DecRef implements RefCounter.DecRef with destructor f.destroy.
-func (f *FSContext) DecRef() {
- f.DecRefWithDestructor(f.destroy)
+func (f *FSContext) DecRef(ctx context.Context) {
+ f.FSContextRefs.DecRef(func() {
+ // Hold f.mu so that we don't race with RootDirectory() and
+ // WorkingDirectory().
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ if VFS2Enabled {
+ f.rootVFS2.DecRef(ctx)
+ f.rootVFS2 = vfs.VirtualDentry{}
+ f.cwdVFS2.DecRef(ctx)
+ f.cwdVFS2 = vfs.VirtualDentry{}
+ } else {
+ f.root.DecRef(ctx)
+ f.root = nil
+ f.cwd.DecRef(ctx)
+ f.cwd = nil
+ }
+ })
}
// Fork forks this FSContext.
//
-// This is not a valid call after destroy.
+// This is not a valid call after f is destroyed.
func (f *FSContext) Fork() *FSContext {
f.mu.Lock()
defer f.mu.Unlock()
if VFS2Enabled {
+ if !f.cwdVFS2.Ok() {
+ panic("FSContext.Fork() called after destroy")
+ }
f.cwdVFS2.IncRef()
f.rootVFS2.IncRef()
} else {
+ if f.cwd == nil {
+ panic("FSContext.Fork() called after destroy")
+ }
f.cwd.IncRef()
f.root.IncRef()
}
- return &FSContext{
+ ctx := &FSContext{
cwd: f.cwd,
root: f.root,
cwdVFS2: f.cwdVFS2,
rootVFS2: f.rootVFS2,
umask: f.umask,
}
+ ctx.EnableLeakCheck()
+ return ctx
}
// WorkingDirectory returns the current working directory.
//
-// This will return nil if called after destroy(), otherwise it will return a
-// Dirent with a reference taken.
+// This will return nil if called after f is destroyed; otherwise it will
+// return a Dirent with a reference taken.
func (f *FSContext) WorkingDirectory() *fs.Dirent {
f.mu.Lock()
defer f.mu.Unlock()
- f.cwd.IncRef()
+ if f.cwd != nil {
+ f.cwd.IncRef()
+ }
return f.cwd
}
// WorkingDirectoryVFS2 returns the current working directory.
//
-// This will return nil if called after destroy(), otherwise it will return a
-// Dirent with a reference taken.
+// This will return an empty vfs.VirtualDentry if called after f is
+// destroyed; otherwise it will return a VirtualDentry with a reference taken.
func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry {
f.mu.Lock()
defer f.mu.Unlock()
- f.cwdVFS2.IncRef()
+ if f.cwdVFS2.Ok() {
+ f.cwdVFS2.IncRef()
+ }
return f.cwdVFS2
}
// SetWorkingDirectory sets the current working directory.
// This will take an extra reference on the Dirent.
//
-// This is not a valid call after destroy.
-func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) {
+// This is not a valid call after f is destroyed.
+func (f *FSContext) SetWorkingDirectory(ctx context.Context, d *fs.Dirent) {
if d == nil {
panic("FSContext.SetWorkingDirectory called with nil dirent")
}
@@ -180,27 +188,31 @@ func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) {
old := f.cwd
f.cwd = d
d.IncRef()
- old.DecRef()
+ old.DecRef(ctx)
}
// SetWorkingDirectoryVFS2 sets the current working directory.
// This will take an extra reference on the VirtualDentry.
//
-// This is not a valid call after destroy.
-func (f *FSContext) SetWorkingDirectoryVFS2(d vfs.VirtualDentry) {
+// This is not a valid call after f is destroyed.
+func (f *FSContext) SetWorkingDirectoryVFS2(ctx context.Context, d vfs.VirtualDentry) {
f.mu.Lock()
defer f.mu.Unlock()
+ if !f.cwdVFS2.Ok() {
+ panic(fmt.Sprintf("FSContext.SetWorkingDirectoryVFS2(%v) called after destroy", d))
+ }
+
old := f.cwdVFS2
f.cwdVFS2 = d
d.IncRef()
- old.DecRef()
+ old.DecRef(ctx)
}
// RootDirectory returns the current filesystem root.
//
-// This will return nil if called after destroy(), otherwise it will return a
-// Dirent with a reference taken.
+// This will return nil if called after f is destroyed; otherwise it will
+// return a Dirent with a reference taken.
func (f *FSContext) RootDirectory() *fs.Dirent {
f.mu.Lock()
defer f.mu.Unlock()
@@ -212,21 +224,23 @@ func (f *FSContext) RootDirectory() *fs.Dirent {
// RootDirectoryVFS2 returns the current filesystem root.
//
-// This will return nil if called after destroy(), otherwise it will return a
-// Dirent with a reference taken.
+// This will return an empty vfs.VirtualDentry if called after f is
+// destroyed; otherwise it will return a VirtualDentry with a reference taken.
func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry {
f.mu.Lock()
defer f.mu.Unlock()
- f.rootVFS2.IncRef()
+ if f.rootVFS2.Ok() {
+ f.rootVFS2.IncRef()
+ }
return f.rootVFS2
}
// SetRootDirectory sets the root directory.
// This will take an extra reference on the Dirent.
//
-// This is not a valid call after free.
-func (f *FSContext) SetRootDirectory(d *fs.Dirent) {
+// This is not a valid call after f is destroyed.
+func (f *FSContext) SetRootDirectory(ctx context.Context, d *fs.Dirent) {
if d == nil {
panic("FSContext.SetRootDirectory called with nil dirent")
}
@@ -241,13 +255,13 @@ func (f *FSContext) SetRootDirectory(d *fs.Dirent) {
old := f.root
f.root = d
d.IncRef()
- old.DecRef()
+ old.DecRef(ctx)
}
// SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd.
//
-// This is not a valid call after free.
-func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) {
+// This is not a valid call after f is destroyed.
+func (f *FSContext) SetRootDirectoryVFS2(ctx context.Context, vd vfs.VirtualDentry) {
if !vd.Ok() {
panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry")
}
@@ -263,7 +277,7 @@ func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) {
vd.IncRef()
f.rootVFS2 = vd
f.mu.Unlock()
- old.DecRef()
+ old.DecRef(ctx)
}
// Umask returns the current umask.
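FSContext, like FDTable and IPCNamespace in this same change, moves from
refs.AtomicRefCount to a generated FSContextRefs type whose DecRef takes the
destructor as a closure. A rough sketch of the generated type's shape,
assuming the usual bias where a stored count of zero means one outstanding
reference (illustrative, not the exact template output):

    type FSContextRefs struct {
        // refCount is biased: 0 means one outstanding reference.
        refCount int64
    }

    func (r *FSContextRefs) IncRef() {
        atomic.AddInt64(&r.refCount, 1)
    }

    // DecRef runs destroy exactly once, when the last reference is dropped.
    func (r *FSContextRefs) DecRef(destroy func()) {
        switch v := atomic.AddInt64(&r.refCount, -1); {
        case v < -1:
            panic("FSContextRefs: DecRef on released object")
        case v == -1:
            if destroy != nil {
                destroy()
            }
        }
    }

Passing the destructor at DecRef time, rather than registering it up front,
is what lets FSContext.DecRef above capture ctx in its cleanup closure.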
diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD
index c5021f2db..daa2dae76 100644
--- a/pkg/sentry/kernel/futex/BUILD
+++ b/pkg/sentry/kernel/futex/BUILD
@@ -51,6 +51,7 @@ go_test(
srcs = ["futex_test.go"],
library = ":futex",
deps = [
+ "//pkg/context",
"//pkg/sync",
"//pkg/usermem",
],
diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go
index 732e66da4..e4dcc4d40 100644
--- a/pkg/sentry/kernel/futex/futex.go
+++ b/pkg/sentry/kernel/futex/futex.go
@@ -19,6 +19,7 @@ package futex
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
@@ -66,9 +67,9 @@ type Key struct {
Offset uint64
}
-func (k *Key) release() {
+func (k *Key) release(t Target) {
if k.MappingIdentity != nil {
- k.MappingIdentity.DecRef()
+ k.MappingIdentity.DecRef(t)
}
k.Mappable = nil
k.MappingIdentity = nil
@@ -94,6 +95,8 @@ func (k *Key) matches(k2 *Key) bool {
// Target abstracts memory accesses and keys.
type Target interface {
+ context.Context
+
// SwapUint32 gives access to usermem.IO.SwapUint32.
SwapUint32(addr usermem.Addr, new uint32) (uint32, error)
@@ -296,7 +299,7 @@ func (b *bucket) wakeWaiterLocked(w *Waiter) {
// bucket "to".
//
// Preconditions: b and to must be locked.
-func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int {
+func (b *bucket) requeueLocked(t Target, to *bucket, key, nkey *Key, n int) int {
done := 0
for w := b.waiters.Front(); done < n && w != nil; {
if !w.key.matches(key) {
@@ -308,7 +311,7 @@ func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int {
requeued := w
w = w.Next() // Next iteration.
b.waiters.Remove(requeued)
- requeued.key.release()
+ requeued.key.release(t)
requeued.key = nkey.clone()
to.waiters.PushBack(requeued)
requeued.bucket.Store(to)
@@ -456,7 +459,7 @@ func (m *Manager) Wake(t Target, addr usermem.Addr, private bool, bitmask uint32
r := b.wakeLocked(&k, bitmask, n)
b.mu.Unlock()
- k.release()
+ k.release(t)
return r, nil
}
@@ -465,12 +468,12 @@ func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, ch
if err != nil {
return 0, err
}
- defer k1.release()
+ defer k1.release(t)
k2, err := getKey(t, naddr, private)
if err != nil {
return 0, err
}
- defer k2.release()
+ defer k2.release(t)
b1, b2 := m.lockBuckets(&k1, &k2)
defer b1.mu.Unlock()
@@ -488,7 +491,7 @@ func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, ch
done := b1.wakeLocked(&k1, ^uint32(0), nwake)
// Requeue the number required.
- b1.requeueLocked(b2, &k1, &k2, nreq)
+ b1.requeueLocked(t, b2, &k1, &k2, nreq)
return done, nil
}
@@ -515,12 +518,12 @@ func (m *Manager) WakeOp(t Target, addr1, addr2 usermem.Addr, private bool, nwak
if err != nil {
return 0, err
}
- defer k1.release()
+ defer k1.release(t)
k2, err := getKey(t, addr2, private)
if err != nil {
return 0, err
}
- defer k2.release()
+ defer k2.release(t)
b1, b2 := m.lockBuckets(&k1, &k2)
defer b1.mu.Unlock()
@@ -571,7 +574,7 @@ func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bo
// Perform our atomic check.
if err := check(t, addr, val); err != nil {
b.mu.Unlock()
- w.key.release()
+ w.key.release(t)
return err
}
@@ -585,7 +588,7 @@ func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bo
// WaitComplete must be called when a Waiter previously added by WaitPrepare is
// no longer eligible to be woken.
-func (m *Manager) WaitComplete(w *Waiter) {
+func (m *Manager) WaitComplete(w *Waiter, t Target) {
// Remove w from the bucket it's in.
for {
b := w.bucket.Load()
@@ -617,7 +620,7 @@ func (m *Manager) WaitComplete(w *Waiter) {
}
// Release references held by the waiter.
- w.key.release()
+ w.key.release(t)
}
// LockPI attempts to lock the futex following the Priority-inheritance futex
@@ -648,13 +651,13 @@ func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, pri
success, err := m.lockPILocked(w, t, addr, tid, b, try)
if err != nil {
- w.key.release()
+ w.key.release(t)
b.mu.Unlock()
return false, err
}
if success || try {
// Release waiter if it's not going to be a wait.
- w.key.release()
+ w.key.release(t)
}
b.mu.Unlock()
return success, nil
@@ -717,10 +720,10 @@ func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint3
}
}
-// UnlockPI unlock the futex following the Priority-inheritance futex
-// rules. The address provided must contain the caller's TID. If there are
-// waiters, TID of the next waiter (FIFO) is set to the given address, and the
-// waiter woken up. If there are no waiters, 0 is set to the address.
+// UnlockPI unlocks the futex following the Priority-inheritance futex rules.
+// The address provided must contain the caller's TID. If there are waiters,
+// the TID of the next waiter (FIFO) is written to the address, and that
+// waiter is woken up. If there are no waiters, 0 is written to the address.
func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool) error {
k, err := getKey(t, addr, private)
if err != nil {
@@ -730,7 +733,7 @@ func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool
err = m.unlockPILocked(t, addr, tid, b, &k)
- k.release()
+ k.release(t)
b.mu.Unlock()
return err
}
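Target now embeds context.Context so a single value can serve both as the
memory-access target and as the context that Key.release needs for
MappingIdentity.DecRef. In the kernel, *kernel.Task already implements both,
so callers simply pass the task; a sketch under that assumption:

    // futexWake wakes one waiter; t doubles as the Target and as the
    // context threaded into key releases inside the manager.
    func futexWake(t *kernel.Task, m *futex.Manager, addr usermem.Addr) (int, error) {
        return m.Wake(t, addr, true /* private */, ^uint32(0), 1)
    }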
diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go
index 7c5c7665b..d0128c548 100644
--- a/pkg/sentry/kernel/futex/futex_test.go
+++ b/pkg/sentry/kernel/futex/futex_test.go
@@ -22,6 +22,7 @@ import (
"testing"
"unsafe"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -29,28 +30,33 @@ import (
// testData implements the Target interface, and allows us to
// treat the address passed for futex operations as an index in
// a byte slice for testing simplicity.
-type testData []byte
+type testData struct {
+ context.Context
+ data []byte
+}
const sizeofInt32 = 4
func newTestData(size uint) testData {
- return make([]byte, size)
+ return testData{
+ data: make([]byte, size),
+ }
}
func (t testData) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) {
- val := atomic.SwapUint32((*uint32)(unsafe.Pointer(&t[addr])), new)
+ val := atomic.SwapUint32((*uint32)(unsafe.Pointer(&t.data[addr])), new)
return val, nil
}
func (t testData) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) {
- if atomic.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&t[addr])), old, new) {
+ if atomic.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&t.data[addr])), old, new) {
return old, nil
}
- return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil
+ return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t.data[addr]))), nil
}
func (t testData) LoadUint32(addr usermem.Addr) (uint32, error) {
- return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil
+ return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t.data[addr]))), nil
}
func (t testData) GetSharedKey(addr usermem.Addr) (Key, error) {
@@ -83,7 +89,7 @@ func TestFutexWake(t *testing.T) {
// Start waiting for wakeup.
w := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(w)
+ defer m.WaitComplete(w, d)
// Perform a wakeup.
if n, err := m.Wake(d, 0, private, ^uint32(0), 1); err != nil || n != 1 {
@@ -106,7 +112,7 @@ func TestFutexWakeBitmask(t *testing.T) {
// Start waiting for wakeup.
w := newPreparedTestWaiter(t, m, d, 0, private, 0, 0x0000ffff)
- defer m.WaitComplete(w)
+ defer m.WaitComplete(w, d)
// Perform a wakeup using the wrong bitmask.
if n, err := m.Wake(d, 0, private, 0xffff0000, 1); err != nil || n != 0 {
@@ -141,7 +147,7 @@ func TestFutexWakeTwo(t *testing.T) {
var ws [3]*Waiter
for i := range ws {
ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(ws[i])
+ defer m.WaitComplete(ws[i], d)
}
// Perform two wakeups.
@@ -174,9 +180,9 @@ func TestFutexWakeUnrelated(t *testing.T) {
// Start two waiters waiting for wakeup on different addresses.
w1 := newPreparedTestWaiter(t, m, d, 0*sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w1)
+ defer m.WaitComplete(w1, d)
w2 := newPreparedTestWaiter(t, m, d, 1*sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w2)
+ defer m.WaitComplete(w2, d)
// Perform two wakeups on the second address.
if n, err := m.Wake(d, 1*sizeofInt32, private, ^uint32(0), 2); err != nil || n != 1 {
@@ -216,9 +222,9 @@ func TestWakeOpFirstNonEmpty(t *testing.T) {
// Add two waiters on address 0.
w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(w1)
+ defer m.WaitComplete(w1, d)
w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(w2)
+ defer m.WaitComplete(w2, d)
// Perform 10 wakeups on address 0.
if n, err := m.WakeOp(d, 0, sizeofInt32, private, 10, 0, 0); err != nil || n != 2 {
@@ -244,9 +250,9 @@ func TestWakeOpSecondNonEmpty(t *testing.T) {
// Add two waiters on address sizeofInt32.
w1 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w1)
+ defer m.WaitComplete(w1, d)
w2 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w2)
+ defer m.WaitComplete(w2, d)
// Perform 10 wakeups on address sizeofInt32 (contingent on
// d.Op(0), which should succeed).
@@ -273,9 +279,9 @@ func TestWakeOpSecondNonEmptyFailingOp(t *testing.T) {
// Add two waiters on address sizeofInt32.
w1 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w1)
+ defer m.WaitComplete(w1, d)
w2 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w2)
+ defer m.WaitComplete(w2, d)
// Perform 10 wakeups on address sizeofInt32 (contingent on
// d.Op(1), which should fail).
@@ -302,15 +308,15 @@ func TestWakeOpAllNonEmpty(t *testing.T) {
// Add two waiters on address 0.
w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(w1)
+ defer m.WaitComplete(w1, d)
w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(w2)
+ defer m.WaitComplete(w2, d)
// Add two waiters on address sizeofInt32.
w3 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w3)
+ defer m.WaitComplete(w3, d)
w4 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w4)
+ defer m.WaitComplete(w4, d)
// Perform 10 wakeups on address 0 (unconditionally), and 10
// wakeups on address sizeofInt32 (contingent on d.Op(0), which
@@ -344,15 +350,15 @@ func TestWakeOpAllNonEmptyFailingOp(t *testing.T) {
// Add two waiters on address 0.
w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(w1)
+ defer m.WaitComplete(w1, d)
w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(w2)
+ defer m.WaitComplete(w2, d)
// Add two waiters on address sizeofInt32.
w3 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w3)
+ defer m.WaitComplete(w3, d)
w4 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w4)
+ defer m.WaitComplete(w4, d)
// Perform 10 wakeups on address 0 (unconditionally), and 10
// wakeups on address sizeofInt32 (contingent on d.Op(1), which
@@ -388,7 +394,7 @@ func TestWakeOpSameAddress(t *testing.T) {
var ws [4]*Waiter
for i := range ws {
ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(ws[i])
+ defer m.WaitComplete(ws[i], d)
}
// Perform 1 wakeup on address 0 (unconditionally), and 1 wakeup
@@ -422,7 +428,7 @@ func TestWakeOpSameAddressFailingOp(t *testing.T) {
var ws [4]*Waiter
for i := range ws {
ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(ws[i])
+ defer m.WaitComplete(ws[i], d)
}
// Perform 1 wakeup on address 0 (unconditionally), and 1 wakeup
@@ -472,7 +478,7 @@ func (t *testMutex) Lock() {
for {
// Attempt to grab the lock.
if atomic.CompareAndSwapUint32(
- (*uint32)(unsafe.Pointer(&t.d[t.a])),
+ (*uint32)(unsafe.Pointer(&t.d.data[t.a])),
testMutexUnlocked,
testMutexLocked) {
// Lock held.
@@ -490,7 +496,7 @@ func (t *testMutex) Lock() {
panic("WaitPrepare returned unexpected error: " + err.Error())
}
<-w.C
- t.m.WaitComplete(w)
+ t.m.WaitComplete(w, t.d)
}
}
@@ -498,7 +504,7 @@ func (t *testMutex) Lock() {
// This will notify any waiters via the futex manager.
func (t *testMutex) Unlock() {
// Unlock.
- atomic.StoreUint32((*uint32)(unsafe.Pointer(&t.d[t.a])), testMutexUnlocked)
+ atomic.StoreUint32((*uint32)(unsafe.Pointer(&t.d.data[t.a])), testMutexUnlocked)
// Notify all waiters.
t.m.Wake(t.d, t.a, true, ^uint32(0), math.MaxInt32)
diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go
index 80a070d7e..b87e40dd1 100644
--- a/pkg/sentry/kernel/ipc_namespace.go
+++ b/pkg/sentry/kernel/ipc_namespace.go
@@ -15,6 +15,7 @@
package kernel
import (
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/kernel/semaphore"
"gvisor.dev/gvisor/pkg/sentry/kernel/shm"
@@ -24,6 +25,8 @@ import (
//
// +stateify savable
type IPCNamespace struct {
+ IPCNamespaceRefs
+
// User namespace which owns this IPC namespace. Immutable.
userNS *auth.UserNamespace
@@ -33,11 +36,13 @@ type IPCNamespace struct {
// NewIPCNamespace creates a new IPC namespace.
func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace {
- return &IPCNamespace{
+ ns := &IPCNamespace{
userNS: userNS,
semaphores: semaphore.NewRegistry(userNS),
shms: shm.NewRegistry(userNS),
}
+ ns.EnableLeakCheck()
+ return ns
}
// SemaphoreRegistry returns the semaphore set registry for this namespace.
@@ -50,6 +55,13 @@ func (i *IPCNamespace) ShmRegistry() *shm.Registry {
return i.shms
}
+// DecRef implements refsvfs2.RefCounter.DecRef.
+func (i *IPCNamespace) DecRef(ctx context.Context) {
+ i.IPCNamespaceRefs.DecRef(func() {
+ i.shms.Release(ctx)
+ })
+}
+
// IPCNamespace returns the task's IPC namespace.
func (t *Task) IPCNamespace() *IPCNamespace {
t.mu.Lock()
diff --git a/pkg/sentry/kernel/kcov.go b/pkg/sentry/kernel/kcov.go
new file mode 100644
index 000000000..4fcdfc541
--- /dev/null
+++ b/pkg/sentry/kernel/kcov.go
@@ -0,0 +1,338 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+ "io"
+ "sync"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/coverage"
+ "gvisor.dev/gvisor/pkg/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/mm"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+ "gvisor.dev/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// kcovAreaSizeMax is the maximum number of uint64 entries allowed in the kcov
+// area. On Linux, the maximum is INT_MAX / 8.
+const kcovAreaSizeMax = 10 * 1024 * 1024
+
+// Kcov provides kernel coverage data to userspace through a memory-mapped
+// region, as kcov does in Linux.
+//
+// To give the illusion that the data is always up to date, we update the shared
+// memory every time before we return to userspace.
+type Kcov struct {
+ // mfp provides application memory. It is immutable after creation.
+ mfp pgalloc.MemoryFileProvider
+
+ // mu protects all of the fields below.
+ mu sync.RWMutex
+
+ // mode is the current kcov mode.
+ mode uint8
+
+ // size is the size of the mapping through which the kernel conveys coverage
+ // information to userspace.
+ size uint64
+
+ // owningTask is the task that currently owns coverage data on the system. The
+ // interface for kcov essentially requires that coverage is only going to a
+ // single task. Note that kcov should only generate coverage data for the
+ // owning task, but we currently generate global coverage.
+ owningTask *Task
+
+ // count is a locally cached version of the first uint64 in the kcov data,
+ // which is the number of subsequent entries representing PCs.
+ //
+ // It is used with Kcov.countBlock() to copy the first element of the
+ // actual data in and out efficiently, avoiding boilerplate and preventing
+ // accidental heap escapes of temporary counts.
+ count uint64
+
+ mappable *mm.SpecialMappable
+}
+
+// NewKcov creates and returns a Kcov instance.
+func (k *Kernel) NewKcov() *Kcov {
+ return &Kcov{
+ mfp: k,
+ }
+}
+
+var coveragePool = sync.Pool{
+ New: func() interface{} {
+ return make([]byte, 0)
+ },
+}
+
+// TaskWork implements TaskWorker.TaskWork.
+func (kcov *Kcov) TaskWork(t *Task) {
+ kcov.mu.Lock()
+ defer kcov.mu.Unlock()
+
+ if kcov.mode != linux.KCOV_MODE_TRACE_PC {
+ return
+ }
+
+ rw := &kcovReadWriter{
+ mf: kcov.mfp.MemoryFile(),
+ fr: kcov.mappable.FileRange(),
+ }
+
+ // Read in the PC count.
+ if _, err := safemem.ReadFullToBlocks(rw, kcov.countBlock()); err != nil {
+ panic(fmt.Sprintf("Internal error reading count from kcov area: %v", err))
+ }
+
+ rw.off = 8 * (1 + kcov.count)
+ n := coverage.ConsumeCoverageData(&kcovIOWriter{rw})
+
+ // Update the PC count based on the number of entries written. Note that if
+ // we reached the end of the kcov area, we may not have written all of the
+ // consumed coverage data.
+ kcov.count += uint64(n / 8)
+ rw.off = 0
+ if _, err := safemem.WriteFullFromBlocks(rw, kcov.countBlock()); err != nil {
+ panic(fmt.Sprintf("Internal error writing count to kcov area: %v", err))
+ }
+
+ // Re-register for future work.
+ t.RegisterWork(kcov)
+}
+
+// InitTrace performs the KCOV_INIT_TRACE ioctl.
+func (kcov *Kcov) InitTrace(size uint64) error {
+ kcov.mu.Lock()
+ defer kcov.mu.Unlock()
+
+ if kcov.mode != linux.KCOV_MODE_DISABLED {
+ return syserror.EBUSY
+ }
+
+ // To simplify all the logic around mapping, we require that the length of the
+ // shared region is a multiple of the system page size.
+ if (8*size)&(usermem.PageSize-1) != 0 {
+ return syserror.EINVAL
+ }
+
+ // We need space for at least two uint64s: one for the current position and
+ // one for a single PC.
+ if size < 2 || size > kcovAreaSizeMax {
+ return syserror.EINVAL
+ }
+
+ kcov.size = size
+ kcov.mode = linux.KCOV_MODE_INIT
+ return nil
+}
+
+// EnableTrace performs the KCOV_ENABLE_TRACE ioctl.
+func (kcov *Kcov) EnableTrace(ctx context.Context, traceKind uint8) error {
+ t := TaskFromContext(ctx)
+ if t == nil {
+ panic("kcovInode.EnableTrace() cannot be used outside of a task goroutine")
+ }
+
+ kcov.mu.Lock()
+ defer kcov.mu.Unlock()
+
+ // KCOV_ENABLE must be preceded by KCOV_INIT_TRACE and an mmap call.
+ if kcov.mode != linux.KCOV_MODE_INIT || kcov.mappable == nil {
+ return syserror.EINVAL
+ }
+
+ switch traceKind {
+ case linux.KCOV_TRACE_PC:
+ kcov.mode = linux.KCOV_MODE_TRACE_PC
+ case linux.KCOV_TRACE_CMP:
+ // We do not support KCOV_MODE_TRACE_CMP.
+ return syserror.ENOTSUP
+ default:
+ return syserror.EINVAL
+ }
+
+ if kcov.owningTask != nil && kcov.owningTask != t {
+ return syserror.EBUSY
+ }
+
+ kcov.owningTask = t
+ t.SetKcov(kcov)
+ t.RegisterWork(kcov)
+
+ // Clear existing coverage data; the task expects to read only coverage data
+ // from the time it is activated.
+ coverage.ClearCoverageData()
+ return nil
+}
+
+// DisableTrace performs the KCOV_DISABLE_TRACE ioctl.
+func (kcov *Kcov) DisableTrace(ctx context.Context) error {
+ kcov.mu.Lock()
+ defer kcov.mu.Unlock()
+
+ t := TaskFromContext(ctx)
+ if t == nil {
+ panic("kcovInode.EnableTrace() cannot be used outside of a task goroutine")
+ }
+
+ if t != kcov.owningTask {
+ return syserror.EINVAL
+ }
+ kcov.mode = linux.KCOV_MODE_INIT
+ kcov.owningTask = nil
+ if kcov.mappable != nil {
+ kcov.mappable.DecRef(ctx)
+ kcov.mappable = nil
+ }
+ return nil
+}
+
+// Clear resets the mode and clears the owning task and memory mapping for kcov.
+// It is called when the fd corresponding to kcov is closed. Note that the mode
+// needs to be set so that the next call to kcov.TaskWork() will exit early.
+func (kcov *Kcov) Clear(ctx context.Context) {
+ kcov.mu.Lock()
+ kcov.mode = linux.KCOV_MODE_INIT
+ kcov.owningTask = nil
+ if kcov.mappable != nil {
+ kcov.mappable.DecRef(ctx)
+ kcov.mappable = nil
+ }
+ kcov.mu.Unlock()
+}
+
+// OnTaskExit is called when the owning task exits. It is similar to
+// kcov.Clear(), except the memory mapping is not cleared, so that the same
+// mapping can be used in the future if kcov is enabled again by another task.
+func (kcov *Kcov) OnTaskExit() {
+ kcov.mu.Lock()
+ kcov.mode = linux.KCOV_MODE_INIT
+ kcov.owningTask = nil
+ kcov.mu.Unlock()
+}
+
+// ConfigureMMap is called by the vfs.FileDescription for this kcov instance to
+// implement vfs.FileDescription.ConfigureMMap.
+func (kcov *Kcov) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+ kcov.mu.Lock()
+ defer kcov.mu.Unlock()
+
+ if kcov.mode != linux.KCOV_MODE_INIT {
+ return syserror.EINVAL
+ }
+
+ if kcov.mappable == nil {
+ // Set up the kcov area.
+ fr, err := kcov.mfp.MemoryFile().Allocate(kcov.size*8, usage.Anonymous)
+ if err != nil {
+ return err
+ }
+
+ // Get the thread id for the mmap name.
+ t := TaskFromContext(ctx)
+ if t == nil {
+ panic("ThreadFromContext returned nil")
+ }
+ // For convenience, a special mappable is used here. Note that these mappings
+ // will look different under /proc/[pid]/maps than they do on Linux.
+ kcov.mappable = mm.NewSpecialMappable(fmt.Sprintf("[kcov:%d]", t.ThreadID()), kcov.mfp, fr)
+ }
+ kcov.mappable.IncRef()
+ opts.Mappable = kcov.mappable
+ opts.MappingIdentity = kcov.mappable
+ return nil
+}
+
+// kcovReadWriter implements safemem.Reader and safemem.Writer.
+type kcovReadWriter struct {
+ off uint64
+ mf *pgalloc.MemoryFile
+ fr memmap.FileRange
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+func (rw *kcovReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+ if dsts.IsEmpty() {
+ return 0, nil
+ }
+
+ // Limit the read to the kcov range and check for overflow.
+ if rw.fr.Length() <= rw.off {
+ return 0, io.EOF
+ }
+ start := rw.fr.Start + rw.off
+ end := rw.fr.Start + rw.fr.Length()
+ if rend := start + dsts.NumBytes(); rend < end {
+ end = rend
+ }
+
+ // Get internal mappings.
+ bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, usermem.Read)
+ if err != nil {
+ return 0, err
+ }
+
+ // Copy from internal mappings.
+ n, err := safemem.CopySeq(dsts, bs)
+ rw.off += n
+ return n, err
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+func (rw *kcovReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+ if srcs.IsEmpty() {
+ return 0, nil
+ }
+
+ // Limit the write to the kcov area and check for overflow.
+ if rw.fr.Length() <= rw.off {
+ return 0, io.EOF
+ }
+ start := rw.fr.Start + rw.off
+ end := rw.fr.Start + rw.fr.Length()
+ if wend := start + srcs.NumBytes(); wend < end {
+ end = wend
+ }
+
+ // Get internal mapping.
+ bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, usermem.Write)
+ if err != nil {
+ return 0, err
+ }
+
+ // Copy to internal mapping.
+ n, err := safemem.CopySeq(bs, srcs)
+ rw.off += n
+ return n, err
+}
+
+// kcovIOWriter implements io.Writer as a basic wrapper over kcovReadWriter.
+type kcovIOWriter struct {
+ rw *kcovReadWriter
+}
+
+// Write implements io.Writer.Write.
+func (w *kcovIOWriter) Write(p []byte) (int, error) {
+ bs := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(p))
+ n, err := safemem.WriteFullFromBlocks(w.rw, bs)
+ return int(n), err
+}
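For orientation, the guest-visible flow that Kcov implements is
init -> mmap -> enable -> collect -> disable. A userspace-style sketch of that
sequence; the device path and ioctl constants mirror Linux's
include/uapi/linux/kcov.h and are assumptions here, not defined by this
commit:

    package main

    import (
        "encoding/binary"
        "fmt"
        "syscall"
    )

    const (
        kcovInitTrace = 0x80086301 // KCOV_INIT_TRACE = _IOR('c', 1, unsigned long)
        kcovEnable    = 0x6364     // KCOV_ENABLE = _IO('c', 100)
        kcovDisable   = 0x6365     // KCOV_DISABLE = _IO('c', 101)
        kcovTracePC   = 0          // KCOV_TRACE_PC
        coverSize     = 64 << 10   // uint64 entries; 8*coverSize is page-aligned
    )

    func ioctl(fd int, req, arg uintptr) error {
        if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), req, arg); errno != 0 {
            return errno
        }
        return nil
    }

    func main() {
        fd, err := syscall.Open("/sys/kernel/debug/kcov", syscall.O_RDWR, 0)
        if err != nil {
            panic(err)
        }
        defer syscall.Close(fd)

        // 1. Size the area (KCOV_MODE_DISABLED -> KCOV_MODE_INIT).
        if err := ioctl(fd, kcovInitTrace, coverSize); err != nil {
            panic(err)
        }
        // 2. Map the area; Kcov.ConfigureMMap services this step.
        mem, err := syscall.Mmap(fd, 0, coverSize*8,
            syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED)
        if err != nil {
            panic(err)
        }
        // 3. Enable tracing for this task (KCOV_MODE_TRACE_PC).
        if err := ioctl(fd, kcovEnable, kcovTracePC); err != nil {
            panic(err)
        }

        // ... run the code to be traced ...

        // Entry 0 holds the PC count; entries 1..n hold the PCs.
        n := binary.LittleEndian.Uint64(mem[:8])
        fmt.Println("PCs recorded:", n)
        if err := ioctl(fd, kcovDisable, 0); err != nil {
            panic(err)
        }
    }

In gVisor, Kcov.TaskWork refreshes the mapped area before each return to
userspace, so reads of the area observe an up-to-date count.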
diff --git a/pkg/sentry/kernel/kcov_unsafe.go b/pkg/sentry/kernel/kcov_unsafe.go
new file mode 100644
index 000000000..6f8a0266b
--- /dev/null
+++ b/pkg/sentry/kernel/kcov_unsafe.go
@@ -0,0 +1,28 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "unsafe"
+
+ "gvisor.dev/gvisor/pkg/safemem"
+)
+
+// countBlock provides a safemem.BlockSeq for kcov.count.
+//
+// Like kcov.count, the returned block is protected by kcov.mu.
+func (kcov *Kcov) countBlock() safemem.BlockSeq {
+ return safemem.BlockSeqOf(safemem.BlockFromSafePointer(unsafe.Pointer(&kcov.count), int(unsafe.Sizeof(kcov.count))))
+}
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 5efeb3767..9b2be44d4 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -34,12 +34,12 @@ package kernel
import (
"errors"
"fmt"
- "io"
"path/filepath"
"sync/atomic"
"time"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/cpuid"
"gvisor.dev/gvisor/pkg/eventchannel"
@@ -73,6 +73,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/uniqueid"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/state"
+ "gvisor.dev/gvisor/pkg/state/wire"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
)
@@ -81,6 +82,10 @@ import (
// easy access everywhere. To be removed once VFS2 becomes the default.
var VFS2Enabled = false
+// FUSEEnabled is set to true when FUSE is enabled. Added as a global to allow
+// easy access everywhere. To be removed once FUSE is completed.
+var FUSEEnabled = false
+
// Kernel represents an emulated Linux kernel. It must be initialized by calling
// Init() or LoadFrom().
//
@@ -194,11 +199,6 @@ type Kernel struct {
// cpuClockTickerSetting is protected by runningTasksMu.
cpuClockTickerSetting ktime.Setting
- // fdMapUids is an ever-increasing counter for generating FDTable uids.
- //
- // fdMapUids is mutable, and is accessed using atomic memory operations.
- fdMapUids uint64
-
// uniqueID is used to generate unique identifiers.
//
// uniqueID is mutable, and is accessed using atomic memory operations.
@@ -221,13 +221,18 @@ type Kernel struct {
// danglingEndpoints is used to save / restore tcpip.DanglingEndpoints.
danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"`
- // sockets is the list of all network sockets the system. Protected by
- // extMu.
+ // sockets is the list of all network sockets in the system.
+ // Protected by extMu.
+ // TODO(gvisor.dev/issue/1624): Only used by VFS1.
sockets socketList
- // nextSocketEntry is the next entry number to use in sockets. Protected
+ // socketsVFS2 records all network sockets in the system. Protected by
+ // extMu.
+ socketsVFS2 map[*vfs.FileDescription]*SocketRecord
+
+ // nextSocketRecord is the next entry number to use in sockets. Protected
// by extMu.
- nextSocketEntry uint64
+ nextSocketRecord uint64
// deviceRegistry is used to save/restore device.SimpleDevices.
deviceRegistry struct{} `state:".(*device.Registry)"`
@@ -249,7 +254,7 @@ type Kernel struct {
// SpecialOpts contains special kernel options.
SpecialOpts
- // VFS keeps the filesystem state used across the kernel.
+ // vfs keeps the filesystem state used across the kernel.
vfs vfs.VirtualFilesystem
// hostMount is the Mount used for file descriptors that were imported
@@ -336,7 +341,7 @@ func (k *Kernel) Init(args InitKernelArgs) error {
return fmt.Errorf("Timekeeper is nil")
}
if args.Timekeeper.clocks == nil {
- return fmt.Errorf("Must call Timekeeper.SetClocks() before Kernel.Init()")
+ return fmt.Errorf("must call Timekeeper.SetClocks() before Kernel.Init()")
}
if args.RootUserNamespace == nil {
return fmt.Errorf("RootUserNamespace is nil")
@@ -361,7 +366,7 @@ func (k *Kernel) Init(args InitKernelArgs) error {
k.useHostCores = true
maxCPU, err := hostcpu.MaxPossibleCPU()
if err != nil {
- return fmt.Errorf("Failed to get maximum CPU number: %v", err)
+ return fmt.Errorf("failed to get maximum CPU number: %v", err)
}
minAppCores := uint(maxCPU) + 1
if k.applicationCores < minAppCores {
@@ -377,7 +382,8 @@ func (k *Kernel) Init(args InitKernelArgs) error {
k.netlinkPorts = port.New()
if VFS2Enabled {
- if err := k.vfs.Init(); err != nil {
+ ctx := k.SupervisorContext()
+ if err := k.vfs.Init(ctx); err != nil {
return fmt.Errorf("failed to initialize VFS: %v", err)
}
@@ -385,19 +391,19 @@ func (k *Kernel) Init(args InitKernelArgs) error {
if err != nil {
return fmt.Errorf("failed to create pipefs filesystem: %v", err)
}
- defer pipeFilesystem.DecRef()
+ defer pipeFilesystem.DecRef(ctx)
pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{})
if err != nil {
return fmt.Errorf("failed to create pipefs mount: %v", err)
}
k.pipeMount = pipeMount
- tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(k.SupervisorContext(), &k.vfs, auth.NewRootCredentials(k.rootUserNamespace))
+ tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(ctx, &k.vfs, auth.NewRootCredentials(k.rootUserNamespace))
if err != nil {
return fmt.Errorf("failed to create tmpfs filesystem: %v", err)
}
- defer tmpfsFilesystem.DecRef()
- defer tmpfsRoot.DecRef()
+ defer tmpfsFilesystem.DecRef(ctx)
+ defer tmpfsRoot.DecRef(ctx)
shmMount, err := k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{})
if err != nil {
return fmt.Errorf("failed to create tmpfs mount: %v", err)
@@ -408,12 +414,14 @@ func (k *Kernel) Init(args InitKernelArgs) error {
if err != nil {
return fmt.Errorf("failed to create sockfs filesystem: %v", err)
}
- defer socketFilesystem.DecRef()
+ defer socketFilesystem.DecRef(ctx)
socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{})
if err != nil {
return fmt.Errorf("failed to create sockfs mount: %v", err)
}
k.socketMount = socketMount
+
+ k.socketsVFS2 = make(map[*vfs.FileDescription]*SocketRecord)
}
return nil
@@ -422,56 +430,70 @@ func (k *Kernel) Init(args InitKernelArgs) error {
// SaveTo saves the state of k to w.
//
// Preconditions: The kernel must be paused throughout the call to SaveTo.
-func (k *Kernel) SaveTo(w io.Writer) error {
+func (k *Kernel) SaveTo(ctx context.Context, w wire.Writer) error {
saveStart := time.Now()
- ctx := k.SupervisorContext()
// Do not allow other Kernel methods to affect it while it's being saved.
k.extMu.Lock()
defer k.extMu.Unlock()
// Stop time.
- k.pauseTimeLocked()
- defer k.resumeTimeLocked()
+ k.pauseTimeLocked(ctx)
+ defer k.resumeTimeLocked(ctx)
// Evict all evictable MemoryFile allocations.
k.mf.StartEvictions()
k.mf.WaitForEvictions()
- // Flush write operations on open files so data reaches backing storage.
- // This must come after MemoryFile eviction since eviction may cause file
- // writes.
- if err := k.tasks.flushWritesToFiles(ctx); err != nil {
- return err
- }
+ if VFS2Enabled {
+ // Discard unsavable mappings, such as those for host file descriptors.
+ if err := k.invalidateUnsavableMappings(ctx); err != nil {
+ return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
+ }
+
+ // Prepare filesystems for saving. This must be done after
+ // invalidateUnsavableMappings(), since dropping memory mappings may
+ // affect filesystem state (e.g. page cache reference counts).
+ if err := k.vfs.PrepareSave(ctx); err != nil {
+ return err
+ }
+ } else {
+ // Flush cached file writes to backing storage. This must come after
+ // MemoryFile eviction since eviction may cause file writes.
+ if err := k.flushWritesToFiles(ctx); err != nil {
+ return err
+ }
- // Remove all epoll waiter objects from underlying wait queues.
- // NOTE: for programs to resume execution in future snapshot scenarios,
- // we will need to re-establish these waiter objects after saving.
- k.tasks.unregisterEpollWaiters()
+ // Remove all epoll waiter objects from underlying wait queues.
+ // NOTE: for programs to resume execution in future snapshot scenarios,
+ // we will need to re-establish these waiter objects after saving.
+ k.tasks.unregisterEpollWaiters(ctx)
- // Clear the dirent cache before saving because Dirents must be Loaded in a
- // particular order (parents before children), and Loading dirents from a cache
- // breaks that order.
- if err := k.flushMountSourceRefs(); err != nil {
- return err
- }
+ // Clear the dirent cache before saving because Dirents must be Loaded in a
+ // particular order (parents before children), and Loading dirents from a cache
+ // breaks that order.
+ if err := k.flushMountSourceRefs(ctx); err != nil {
+ return err
+ }
- // Ensure that all pending asynchronous work is complete:
- // - inode and mount release
- // - asynchronuous IO
- fs.AsyncBarrier()
-
- // Once all fs work has completed (flushed references have all been released),
- // reset mount mappings. This allows individual mounts to save how inodes map
- // to filesystem resources. Without this, fs.Inodes cannot be restored.
- fs.SaveInodeMappings()
-
- // Discard unsavable mappings, such as those for host file descriptors.
- // This must be done after waiting for "asynchronous fs work", which
- // includes async I/O that may touch application memory.
- if err := k.invalidateUnsavableMappings(ctx); err != nil {
- return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
+ // Ensure that all inode and mount release operations have completed.
+ fs.AsyncBarrier()
+
+ // Once all fs work has completed (flushed references have all been released),
+ // reset mount mappings. This allows individual mounts to save how inodes map
+ // to filesystem resources. Without this, fs.Inodes cannot be restored.
+ fs.SaveInodeMappings()
+
+ // Discard unsavable mappings, such as those for host file descriptors.
+ // This must be done after waiting for "asynchronous fs work", which
+ // includes async I/O that may touch application memory.
+ //
+ // TODO(gvisor.dev/issue/1624): This rationale is believed to be
+ // obsolete since AIO callbacks are now waited for by Kernel.Pause(),
+ // but this order is conservatively retained for VFS1.
+ if err := k.invalidateUnsavableMappings(ctx); err != nil {
+ return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
+ }
}
// Save the CPUID FeatureSet before the rest of the kernel so we can
@@ -480,23 +502,23 @@ func (k *Kernel) SaveTo(w io.Writer) error {
//
// N.B. This will also be saved along with the full kernel save below.
cpuidStart := time.Now()
- if err := state.Save(k.SupervisorContext(), w, k.FeatureSet(), nil); err != nil {
+ if _, err := state.Save(ctx, w, k.FeatureSet()); err != nil {
return err
}
log.Infof("CPUID save took [%s].", time.Since(cpuidStart))
// Save the kernel state.
kernelStart := time.Now()
- var stats state.Stats
- if err := state.Save(k.SupervisorContext(), w, k, &stats); err != nil {
+ stats, err := state.Save(ctx, w, k)
+ if err != nil {
return err
}
- log.Infof("Kernel save stats: %s", &stats)
+ log.Infof("Kernel save stats: %s", stats.String())
log.Infof("Kernel save took [%s].", time.Since(kernelStart))
// Save the memory file's state.
memoryStart := time.Now()
- if err := k.mf.SaveTo(k.SupervisorContext(), w); err != nil {
+ if err := k.mf.SaveTo(ctx, w); err != nil {
return err
}
log.Infof("Memory save took [%s].", time.Since(memoryStart))
@@ -508,7 +530,9 @@ func (k *Kernel) SaveTo(w io.Writer) error {
// flushMountSourceRefs flushes the MountSources for all mounted filesystems
// and open FDs.
-func (k *Kernel) flushMountSourceRefs() error {
+//
+// Preconditions: !VFS2Enabled.
+func (k *Kernel) flushMountSourceRefs(ctx context.Context) error {
// Flush all mount sources for currently mounted filesystems in each task.
flushed := make(map[*fs.MountNamespace]struct{})
k.tasks.mu.RLock()
@@ -524,7 +548,7 @@ func (k *Kernel) flushMountSourceRefs() error {
// There may be some open FDs whose filesystems have been unmounted. We
// must flush those as well.
- return k.tasks.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error {
+ return k.tasks.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error {
file.Dirent.Inode.MountSource.FlushDirentRefs()
return nil
})
@@ -534,12 +558,7 @@ func (k *Kernel) flushMountSourceRefs() error {
// each task.
//
// Precondition: Must be called with the kernel paused.
-func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) {
- // TODO(gvisor.dev/issue/1663): Add save support for VFS2.
- if VFS2Enabled {
- return nil
- }
-
+func (ts *TaskSet) forEachFDPaused(ctx context.Context, f func(*fs.File, *vfs.FileDescription) error) (err error) {
ts.mu.RLock()
defer ts.mu.RUnlock()
for t := range ts.Root.tids {
@@ -547,7 +566,7 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error)
if t.fdTable == nil {
continue
}
- t.fdTable.forEach(func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) {
+ t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) {
if lastErr := f(file, fileVFS2); lastErr != nil && err == nil {
err = lastErr
}
@@ -556,9 +575,9 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error)
return err
}
-func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
- // TODO(gvisor.dev/issue/1663): Add save support for VFS2.
- return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error {
+// Preconditions: !VFS2Enabled.
+func (k *Kernel) flushWritesToFiles(ctx context.Context) error {
+ return k.tasks.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error {
if flags := file.Flags(); !flags.Write {
return nil
}
@@ -580,6 +599,32 @@ func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
})
}
+// Preconditions: !VFS2Enabled.
+func (ts *TaskSet) unregisterEpollWaiters(ctx context.Context) {
+ ts.mu.RLock()
+ defer ts.mu.RUnlock()
+
+ // Tasks in the same process may share an FDTable, so we track
+ // processed tables in a map to avoid unregistering the same
+ // FDTable's waiters more than once.
+ processed := make(map[*FDTable]struct{})
+ for t := range ts.Root.tids {
+ // We can skip locking Task.mu here since the kernel is paused.
+ if t.fdTable == nil {
+ continue
+ }
+ if _, ok := processed[t.fdTable]; ok {
+ continue
+ }
+ t.fdTable.forEach(ctx, func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+ if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
+ e.UnregisterEpollWaiters()
+ }
+ })
+ processed[t.fdTable] = struct{}{}
+ }
+}
+
// Preconditions: The kernel must be paused.
func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
invalidated := make(map[*mm.MemoryManager]struct{})
@@ -605,38 +650,8 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
return nil
}
-func (ts *TaskSet) unregisterEpollWaiters() {
- // TODO(gvisor.dev/issue/1663): Add save support for VFS2.
- if VFS2Enabled {
- return
- }
-
- ts.mu.RLock()
- defer ts.mu.RUnlock()
-
- // Tasks that belong to the same process could potentially point to the
- // same FDTable. So we retain a map of processed ones to avoid
- // processing the same FDTable multiple times.
- processed := make(map[*FDTable]struct{})
- for t := range ts.Root.tids {
- // We can skip locking Task.mu here since the kernel is paused.
- if t.fdTable == nil {
- continue
- }
- if _, ok := processed[t.fdTable]; ok {
- continue
- }
- t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
- if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
- e.UnregisterEpollWaiters()
- }
- })
- processed[t.fdTable] = struct{}{}
- }
-}
-
// LoadFrom returns a new Kernel loaded from args.
-func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) error {
+func (k *Kernel) LoadFrom(ctx context.Context, r wire.Reader, net inet.Stack, clocks sentrytime.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error {
loadStart := time.Now()
initAppCores := k.applicationCores
@@ -647,7 +662,7 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks)
// don't need to explicitly install it in the Kernel.
cpuidStart := time.Now()
var features cpuid.FeatureSet
- if err := state.Load(k.SupervisorContext(), r, &features, nil); err != nil {
+ if _, err := state.Load(ctx, r, &features); err != nil {
return err
}
log.Infof("CPUID load took [%s].", time.Since(cpuidStart))
@@ -662,11 +677,11 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks)
// Load the kernel state.
kernelStart := time.Now()
- var stats state.Stats
- if err := state.Load(k.SupervisorContext(), r, k, &stats); err != nil {
+ stats, err := state.Load(ctx, r, k)
+ if err != nil {
return err
}
- log.Infof("Kernel load stats: %s", &stats)
+ log.Infof("Kernel load stats: %s", stats.String())
log.Infof("Kernel load took [%s].", time.Since(kernelStart))
// rootNetworkNamespace should be populated after loading the state file.
@@ -675,7 +690,7 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks)
// Load the memory file's state.
memoryStart := time.Now()
- if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil {
+ if err := k.mf.LoadFrom(ctx, r); err != nil {
return err
}
log.Infof("Memory load took [%s].", time.Since(memoryStart))
@@ -687,11 +702,17 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks)
net.Resume()
}
- // Ensure that all pending asynchronous work is complete:
- // - namedpipe opening
- // - inode file opening
- if err := fs.AsyncErrorBarrier(); err != nil {
- return err
+ if VFS2Enabled {
+ if err := k.vfs.CompleteRestore(ctx, vfsOpts); err != nil {
+ return err
+ }
+ } else {
+ // Ensure that all pending asynchronous work is complete:
+ // - namedpipe opening
+ // - inode file opening
+ if err := fs.AsyncErrorBarrier(); err != nil {
+ return err
+ }
}
tcpip.AsyncLoading.Wait()
@@ -820,7 +841,9 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
case CtxUTSNamespace:
return ctx.args.UTSNamespace
case CtxIPCNamespace:
- return ctx.args.IPCNamespace
+ ipcns := ctx.args.IPCNamespace
+ ipcns.IncRef()
+ return ipcns
case auth.CtxCredentials:
return ctx.args.Credentials
case fs.CtxRoot:
@@ -833,14 +856,16 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
if ctx.args.MountNamespaceVFS2 == nil {
return nil
}
- // MountNamespaceVFS2.Root() takes a reference on the root dirent for us.
- return ctx.args.MountNamespaceVFS2.Root()
+ root := ctx.args.MountNamespaceVFS2.Root()
+ root.IncRef()
+ return root
case vfs.CtxMountNamespace:
if ctx.k.globalInit == nil {
return nil
}
- // MountNamespaceVFS2 takes a reference for us.
- return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
+ mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
+ mntns.IncRef()
+ return mntns
case fs.CtxDirentCacheLimiter:
return ctx.k.DirentCacheLimiter
case inet.CtxStack:
@@ -890,20 +915,20 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
opener fsbridge.Lookup
fsContext *FSContext
mntns *fs.MountNamespace
+ mntnsVFS2 *vfs.MountNamespace
)
if VFS2Enabled {
- mntnsVFS2 := args.MountNamespaceVFS2
+ mntnsVFS2 = args.MountNamespaceVFS2
if mntnsVFS2 == nil {
- // MountNamespaceVFS2 adds a reference to the namespace, which is
- // transferred to the new process.
- mntnsVFS2 = k.GlobalInit().Leader().MountNamespaceVFS2()
+ // Add a reference to the namespace, which is transferred to the new process.
+ mntnsVFS2 = k.globalInit.Leader().MountNamespaceVFS2()
+ mntnsVFS2.IncRef()
}
// Get the root directory from the MountNamespace.
- root := args.MountNamespaceVFS2.Root()
- // The call to newFSContext below will take a reference on root, so we
- // don't need to hold this one.
- defer root.DecRef()
+ root := mntnsVFS2.Root()
+ root.IncRef()
+ defer root.DecRef(ctx)
// Grab the working directory.
wd := root // Default.
@@ -921,7 +946,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
if err != nil {
return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
}
- defer wd.DecRef()
+ defer wd.DecRef(ctx)
}
opener = fsbridge.NewVFSLookup(mntnsVFS2, root, wd)
fsContext = NewFSContextVFS2(root, wd, args.Umask)
@@ -936,7 +961,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
root := mntns.Root()
// The call to newFSContext below will take a reference on root, so we
// don't need to hold this one.
- defer root.DecRef()
+ defer root.DecRef(ctx)
// Grab the working directory.
remainingTraversals := args.MaxSymlinkTraversals
@@ -947,13 +972,17 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
if err != nil {
return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
}
- defer wd.DecRef()
+ defer wd.DecRef(ctx)
}
opener = fsbridge.NewFSLookup(mntns, root, wd)
fsContext = newFSContext(root, wd, args.Umask)
}
tg := k.NewThreadGroup(mntns, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
+ cu := cleanup.Make(func() {
+ tg.Release(ctx)
+ })
+ defer cu.Clean()
// Check which file to start from.
switch {
@@ -1010,16 +1039,17 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
UTSNamespace: args.UTSNamespace,
IPCNamespace: args.IPCNamespace,
AbstractSocketNamespace: args.AbstractSocketNamespace,
- MountNamespaceVFS2: args.MountNamespaceVFS2,
+ MountNamespaceVFS2: mntnsVFS2,
ContainerID: args.ContainerID,
}
- t, err := k.tasks.NewTask(config)
+ t, err := k.tasks.NewTask(ctx, config)
if err != nil {
return nil, 0, err
}
t.traceExecEvent(tc) // Simulate exec for tracing.
// Success.
+ cu.Release()
tgid := k.tasks.Root.IDOfThreadGroup(tg)
if k.globalInit == nil {
k.globalInit = tg
@@ -1057,7 +1087,7 @@ func (k *Kernel) Start() error {
// If k was created by LoadKernelFrom, timers were stopped during
// Kernel.SaveTo and need to be resumed. If k was created by NewKernel,
// this is a no-op.
- k.resumeTimeLocked()
+ k.resumeTimeLocked(k.SupervisorContext())
// Start task goroutines.
k.tasks.mu.RLock()
defer k.tasks.mu.RUnlock()
@@ -1069,9 +1099,10 @@ func (k *Kernel) Start() error {
// pauseTimeLocked pauses all Timers and Timekeeper updates.
//
-// Preconditions: Any task goroutines running in k must be stopped. k.extMu
-// must be locked.
-func (k *Kernel) pauseTimeLocked() {
+// Preconditions:
+// * Any task goroutines running in k must be stopped.
+// * k.extMu must be locked.
+func (k *Kernel) pauseTimeLocked(ctx context.Context) {
// k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before
// Kernel.Start().
if k.cpuClockTicker != nil {
@@ -1093,7 +1124,7 @@ func (k *Kernel) pauseTimeLocked() {
// This means we'll iterate FDTables shared by multiple tasks repeatedly,
// but ktime.Timer.Pause is idempotent so this is harmless.
if t.fdTable != nil {
- t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
+ t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
if VFS2Enabled {
if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
tfd.PauseTimer()
@@ -1113,9 +1144,10 @@ func (k *Kernel) pauseTimeLocked() {
// pauseTimeLocked has not been previously called, resumeTimeLocked has no
// effect.
//
-// Preconditions: Any task goroutines running in k must be stopped. k.extMu
-// must be locked.
-func (k *Kernel) resumeTimeLocked() {
+// Preconditions:
+// * Any task goroutines running in k must be stopped.
+// * k.extMu must be locked.
+func (k *Kernel) resumeTimeLocked(ctx context.Context) {
if k.cpuClockTicker != nil {
k.cpuClockTicker.Resume()
}
@@ -1129,7 +1161,7 @@ func (k *Kernel) resumeTimeLocked() {
}
}
if t.fdTable != nil {
- t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
+ t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
if VFS2Enabled {
if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
tfd.ResumeTimer()
@@ -1254,13 +1286,22 @@ func (k *Kernel) Kill(es ExitStatus) {
}
// Pause requests that all tasks in k temporarily stop executing, and blocks
-// until all tasks in k have stopped. Multiple calls to Pause nest and require
-// an equal number of calls to Unpause to resume execution.
+// until all tasks and asynchronous I/O operations in k have stopped. Multiple
+// calls to Pause nest and require an equal number of calls to Unpause to
+// resume execution.
func (k *Kernel) Pause() {
k.extMu.Lock()
k.tasks.BeginExternalStop()
k.extMu.Unlock()
k.tasks.runningGoroutines.Wait()
+ k.tasks.aioGoroutines.Wait()
+}
+
+// ReceiveTaskStates receives full states for all tasks.
+func (k *Kernel) ReceiveTaskStates() {
+ k.extMu.Lock()
+ k.tasks.PullFullState()
+ k.extMu.Unlock()
}
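Pause now also waits on aioGoroutines, which is what lets the VFS2 branch of SaveTo above drop the old flush/epoll-unregister dance. A hedged sketch of a checkpoint driver using the new contract (checkpoint is hypothetical; the real orchestration lives in the sandbox controller, and the context, kernel, and wire packages are assumed to be imported):

    // checkpoint pauses the kernel, pulls full task states from the
    // platform, and serializes everything to w. Pause nests, so the
    // deferred Unpause keeps the counts balanced even on error.
    func checkpoint(ctx context.Context, k *kernel.Kernel, w wire.Writer) error {
        k.Pause() // returns only once tasks *and* AIO goroutines stop
        defer k.Unpause()
        k.ReceiveTaskStates()
        return k.SaveTo(ctx, w)
    }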
// Unpause ends the effect of a previous call to Pause. If Unpause is called
@@ -1353,8 +1394,9 @@ func (k *Kernel) RootUTSNamespace() *UTSNamespace {
return k.rootUTSNamespace
}
-// RootIPCNamespace returns the root IPCNamespace.
+// RootIPCNamespace takes a reference and returns the root IPCNamespace.
func (k *Kernel) RootIPCNamespace() *IPCNamespace {
+ k.rootIPCNamespace.IncRef()
return k.rootIPCNamespace
}
@@ -1470,6 +1512,11 @@ func (k *Kernel) NowMonotonic() int64 {
return now
}
+// AfterFunc implements tcpip.Clock.AfterFunc.
+func (k *Kernel) AfterFunc(d time.Duration, f func()) tcpip.Timer {
+ return ktime.TcpipAfterFunc(k.realtimeClock, d, f)
+}
+
// SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or
// LoadFrom.
func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) {
@@ -1494,20 +1541,27 @@ func (k *Kernel) SupervisorContext() context.Context {
}
}
-// SocketEntry represents a socket recorded in Kernel.sockets. It implements
+// SocketRecord represents a socket recorded in Kernel.socketsVFS2.
+//
+// +stateify savable
+type SocketRecord struct {
+ k *Kernel
+ Sock *refs.WeakRef // TODO(gvisor.dev/issue/1624): Only used by VFS1.
+ SockVFS2 *vfs.FileDescription // Only used by VFS2.
+ ID uint64 // Socket table entry number.
+}
+
+// SocketRecordVFS1 represents a socket recorded in Kernel.sockets. It implements
// refs.WeakRefUser for sockets stored in the socket table.
//
// +stateify savable
-type SocketEntry struct {
+type SocketRecordVFS1 struct {
socketEntry
- k *Kernel
- Sock *refs.WeakRef
- SockVFS2 *vfs.FileDescription
- ID uint64 // Socket table entry number.
+ SocketRecord
}
// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
-func (s *SocketEntry) WeakRefGone() {
+func (s *SocketRecordVFS1) WeakRefGone(context.Context) {
s.k.extMu.Lock()
s.k.sockets.Remove(s)
s.k.extMu.Unlock()
@@ -1518,9 +1572,14 @@ func (s *SocketEntry) WeakRefGone() {
// Precondition: Caller must hold a reference to sock.
func (k *Kernel) RecordSocket(sock *fs.File) {
k.extMu.Lock()
- id := k.nextSocketEntry
- k.nextSocketEntry++
- s := &SocketEntry{k: k, ID: id}
+ id := k.nextSocketRecord
+ k.nextSocketRecord++
+ s := &SocketRecordVFS1{
+ SocketRecord: SocketRecord{
+ k: k,
+ ID: id,
+ },
+ }
s.Sock = refs.NewWeakRef(sock, s)
k.sockets.PushBack(s)
k.extMu.Unlock()
@@ -1532,29 +1591,45 @@ func (k *Kernel) RecordSocket(sock *fs.File) {
// Precondition: Caller must hold a reference to sock.
//
// Note that the socket table will not hold a reference on the
-// vfs.FileDescription, because we do not support weak refs on VFS2 files.
+// vfs.FileDescription.
func (k *Kernel) RecordSocketVFS2(sock *vfs.FileDescription) {
k.extMu.Lock()
- id := k.nextSocketEntry
- k.nextSocketEntry++
- s := &SocketEntry{
+ if _, ok := k.socketsVFS2[sock]; ok {
+ panic(fmt.Sprintf("Socket %p added twice", sock))
+ }
+ id := k.nextSocketRecord
+ k.nextSocketRecord++
+ s := &SocketRecord{
k: k,
ID: id,
SockVFS2: sock,
}
- k.sockets.PushBack(s)
+ k.socketsVFS2[sock] = s
+ k.extMu.Unlock()
+}
+
+// DeleteSocketVFS2 removes a VFS2 socket from the system-wide socket table.
+func (k *Kernel) DeleteSocketVFS2(sock *vfs.FileDescription) {
+ k.extMu.Lock()
+ delete(k.socketsVFS2, sock)
k.extMu.Unlock()
}
// ListSockets returns a snapshot of all sockets.
//
-// Callers of ListSockets() in VFS2 should use SocketEntry.SockVFS2.TryIncRef()
+// Callers of ListSockets() in VFS2 should use SocketRecord.SockVFS2.TryIncRef()
// to get a reference on a socket in the table.
-func (k *Kernel) ListSockets() []*SocketEntry {
+func (k *Kernel) ListSockets() []*SocketRecord {
k.extMu.Lock()
- var socks []*SocketEntry
- for s := k.sockets.Front(); s != nil; s = s.Next() {
- socks = append(socks, s)
+ var socks []*SocketRecord
+ if VFS2Enabled {
+ for _, s := range k.socketsVFS2 {
+ socks = append(socks, s)
+ }
+ } else {
+ for s := k.sockets.Front(); s != nil; s = s.Next() {
+ socks = append(socks, &s.SocketRecord)
+ }
}
k.extMu.Unlock()
return socks
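Since the VFS2 table deliberately holds no reference on SockVFS2, consumers must follow the TryIncRef contract stated in the doc comment above. A minimal sketch of a compliant iteration (visit is a hypothetical callback):

    func forEachVFS2Socket(ctx context.Context, k *kernel.Kernel, visit func(id uint64, fd *vfs.FileDescription)) {
        for _, rec := range k.ListSockets() {
            // Skip VFS1 records and sockets racing with destruction;
            // TryIncRef fails once the FD's refcount has hit zero.
            if rec.SockVFS2 == nil || !rec.SockVFS2.TryIncRef() {
                continue
            }
            visit(rec.ID, rec.SockVFS2)
            rec.SockVFS2.DecRef(ctx)
        }
    }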
@@ -1582,7 +1657,9 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
case CtxUTSNamespace:
return ctx.k.rootUTSNamespace
case CtxIPCNamespace:
- return ctx.k.rootIPCNamespace
+ ipcns := ctx.k.rootIPCNamespace
+ ipcns.IncRef()
+ return ipcns
case auth.CtxCredentials:
// The supervisor context is global root.
return auth.NewRootCredentials(ctx.k.rootUserNamespace)
@@ -1595,16 +1672,16 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
if ctx.k.globalInit == nil {
return vfs.VirtualDentry{}
}
- mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
- defer mntns.DecRef()
- // Root() takes a reference on the root dirent for us.
- return mntns.Root()
+ root := ctx.k.GlobalInit().Leader().MountNamespaceVFS2().Root()
+ root.IncRef()
+ return root
case vfs.CtxMountNamespace:
if ctx.k.globalInit == nil {
return nil
}
- // MountNamespaceVFS2() takes a reference for us.
- return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
+ mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
+ mntns.IncRef()
+ return mntns
case fs.CtxDirentCacheLimiter:
return ctx.k.DirentCacheLimiter
case inet.CtxStack:
@@ -1685,3 +1762,20 @@ func (k *Kernel) ShmMount() *vfs.Mount {
func (k *Kernel) SocketMount() *vfs.Mount {
return k.socketMount
}
+
+// Release releases resources owned by k.
+//
+// Precondition: This should only be called after the kernel is fully
+// initialized, e.g. after k.Start() has been called.
+func (k *Kernel) Release() {
+ ctx := k.SupervisorContext()
+ if VFS2Enabled {
+ k.hostMount.DecRef(ctx)
+ k.pipeMount.DecRef(ctx)
+ k.shmMount.DecRef(ctx)
+ k.socketMount.DecRef(ctx)
+ k.vfs.Release(ctx)
+ }
+ k.timekeeper.Destroy()
+ k.vdso.Release(ctx)
+}
diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD
index 7bfa9075a..99134e634 100644
--- a/pkg/sentry/kernel/pipe/BUILD
+++ b/pkg/sentry/kernel/pipe/BUILD
@@ -21,11 +21,13 @@ go_library(
"//pkg/amutex",
"//pkg/buffer",
"//pkg/context",
+ "//pkg/marshal/primitive",
"//pkg/safemem",
"//pkg/sentry/arch",
"//pkg/sentry/device",
"//pkg/sentry/fs",
"//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/vfs",
"//pkg/sync",
"//pkg/syserror",
diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go
index 4b688c627..6497dc4ba 100644
--- a/pkg/sentry/kernel/pipe/node.go
+++ b/pkg/sentry/kernel/pipe/node.go
@@ -93,7 +93,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi
if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() {
if !waitFor(&i.mu, &i.wWakeup, ctx) {
- r.DecRef()
+ r.DecRef(ctx)
return nil, syserror.ErrInterrupted
}
}
@@ -111,12 +111,12 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi
// On a nonblocking, write-only open, the open fails with ENXIO if the
// read side isn't open yet.
if flags.NonBlocking {
- w.DecRef()
+ w.DecRef(ctx)
return nil, syserror.ENXIO
}
if !waitFor(&i.mu, &i.rWakeup, ctx) {
- w.DecRef()
+ w.DecRef(ctx)
return nil, syserror.ErrInterrupted
}
}
diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go
index ab75a87ff..ce0db5583 100644
--- a/pkg/sentry/kernel/pipe/node_test.go
+++ b/pkg/sentry/kernel/pipe/node_test.go
@@ -167,7 +167,7 @@ func TestClosedReaderBlocksWriteOpen(t *testing.T) {
f := NewInodeOperations(ctx, perms, newNamedPipe(t))
rFile, _ := testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil)
- rFile.DecRef()
+ rFile.DecRef(ctx)
wDone := make(chan struct{})
// This open for write should block because the reader is now gone.
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 79645d7d2..67beb0ad6 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -17,6 +17,7 @@ package pipe
import (
"fmt"
+ "io"
"sync/atomic"
"syscall"
@@ -152,7 +153,7 @@ func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs.
d := fs.NewDirent(ctx, fs.NewInode(ctx, iops, ms, sattr), fmt.Sprintf("pipe:[%d]", ino))
// The p.Open calls below will each take a reference on the Dirent. We
// must drop the one we already have.
- defer d.DecRef()
+ defer d.DecRef(ctx)
return p.Open(ctx, d, fs.FileFlags{Read: true}), p.Open(ctx, d, fs.FileFlags{Write: true})
}
@@ -200,22 +201,22 @@ type readOps struct {
//
// Precondition: this pipe must have readers.
func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) {
- // Don't block for a zero-length read even if the pipe is empty.
- if ops.left() == 0 {
- return 0, nil
- }
-
p.mu.Lock()
defer p.mu.Unlock()
return p.readLocked(ctx, ops)
}
func (p *Pipe) readLocked(ctx context.Context, ops readOps) (int64, error) {
+ // Don't block for a zero-length read even if the pipe is empty.
+ if ops.left() == 0 {
+ return 0, nil
+ }
+
// Is the pipe empty?
if p.view.Size() == 0 {
if !p.HasWriters() {
// There are no writers, return EOF.
- return 0, nil
+ return 0, io.EOF
}
return 0, syserror.ErrWouldBlock
}
@@ -388,6 +389,10 @@ func (p *Pipe) rwReadiness() waiter.EventMask {
func (p *Pipe) queued() int64 {
p.mu.Lock()
defer p.mu.Unlock()
+ return p.queuedLocked()
+}
+
+func (p *Pipe) queuedLocked() int64 {
return p.view.Size()
}
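The queued/queuedLocked split follows the usual Go locked-variant convention: the exported method acquires p.mu, while the *Locked form assumes it, so the splice helpers in vfs.go (which hold p.mu across the whole transfer) can reuse the logic without self-deadlock. A hedged, in-package sketch of the composition this enables (peekThenRead is hypothetical):

    // peekThenRead atomically checks for buffered data and then reads,
    // holding p.mu across both steps so no writer can slip in between.
    func (p *Pipe) peekThenRead(ctx context.Context, ops readOps) (int64, error) {
        p.mu.Lock()
        defer p.mu.Unlock()
        if p.queuedLocked() == 0 {
            return 0, syserror.ErrWouldBlock
        }
        return p.readLocked(ctx, ops)
    }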
diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go
index bda739dbe..fe97e9800 100644
--- a/pkg/sentry/kernel/pipe/pipe_test.go
+++ b/pkg/sentry/kernel/pipe/pipe_test.go
@@ -27,8 +27,8 @@ import (
func TestPipeRW(t *testing.T) {
ctx := contexttest.Context(t)
r, w := NewConnectedPipe(ctx, 65536, 4096)
- defer r.DecRef()
- defer w.DecRef()
+ defer r.DecRef(ctx)
+ defer w.DecRef(ctx)
msg := []byte("here's some bytes")
wantN := int64(len(msg))
@@ -47,8 +47,8 @@ func TestPipeRW(t *testing.T) {
func TestPipeReadBlock(t *testing.T) {
ctx := contexttest.Context(t)
r, w := NewConnectedPipe(ctx, 65536, 4096)
- defer r.DecRef()
- defer w.DecRef()
+ defer r.DecRef(ctx)
+ defer w.DecRef(ctx)
n, err := r.Readv(ctx, usermem.BytesIOSequence(make([]byte, 1)))
if n != 0 || err != syserror.ErrWouldBlock {
@@ -62,8 +62,8 @@ func TestPipeWriteBlock(t *testing.T) {
ctx := contexttest.Context(t)
r, w := NewConnectedPipe(ctx, capacity, atomicIOBytes)
- defer r.DecRef()
- defer w.DecRef()
+ defer r.DecRef(ctx)
+ defer w.DecRef(ctx)
msg := make([]byte, capacity+1)
n, err := w.Writev(ctx, usermem.BytesIOSequence(msg))
@@ -77,8 +77,8 @@ func TestPipeWriteUntilEnd(t *testing.T) {
ctx := contexttest.Context(t)
r, w := NewConnectedPipe(ctx, atomicIOBytes, atomicIOBytes)
- defer r.DecRef()
- defer w.DecRef()
+ defer r.DecRef(ctx)
+ defer w.DecRef(ctx)
msg := []byte("here's some bytes")
diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go
index aacf28da2..f665920cb 100644
--- a/pkg/sentry/kernel/pipe/pipe_util.go
+++ b/pkg/sentry/kernel/pipe/pipe_util.go
@@ -23,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/amutex"
"gvisor.dev/gvisor/pkg/buffer"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/usermem"
@@ -33,7 +34,7 @@ import (
// the old fs architecture.
// Release cleans up the pipe's state.
-func (p *Pipe) Release() {
+func (p *Pipe) Release(context.Context) {
p.rClose()
p.wClose()
@@ -145,9 +146,14 @@ func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgume
v = math.MaxInt32 // Silently truncate.
}
// Copy result to userspace.
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ iocc := primitive.IOCopyContext{
+ IO: io,
+ Ctx: ctx,
+ Opts: usermem.IOOpts{
+ AddressSpaceActive: true,
+ },
+ }
+ _, err := primitive.CopyInt32Out(&iocc, args[2].Pointer(), int32(v))
return 0, err
default:
return 0, syscall.ENOTTY
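The Ioctl change above is one instance of the broader go-marshal migration: instead of reflection-based usermem.CopyObjectOut, the raw usermem.IO is wrapped in a primitive.IOCopyContext and a typed copy is used. The pattern in isolation, as a hedged sketch (copyInt32 is hypothetical):

    func copyInt32(ctx context.Context, io usermem.IO, addr usermem.Addr, v int32) error {
        iocc := primitive.IOCopyContext{
            Ctx: ctx,
            IO:  io,
            Opts: usermem.IOOpts{
                AddressSpaceActive: true,
            },
        }
        // CopyInt32Out serializes v with go-marshal (no reflection) and
        // writes it through the wrapped IO.
        _, err := primitive.CopyInt32Out(&iocc, addr, v)
        return err
    }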
diff --git a/pkg/sentry/kernel/pipe/reader.go b/pkg/sentry/kernel/pipe/reader.go
index 7724b4452..ac18785c0 100644
--- a/pkg/sentry/kernel/pipe/reader.go
+++ b/pkg/sentry/kernel/pipe/reader.go
@@ -15,6 +15,7 @@
package pipe
import (
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -29,7 +30,7 @@ type Reader struct {
// Release implements fs.FileOperations.Release.
//
// This overrides ReaderWriter.Release.
-func (r *Reader) Release() {
+func (r *Reader) Release(context.Context) {
r.Pipe.rClose()
// Wake up writers.
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index 2602bed72..d96bf253b 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -20,6 +20,7 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
@@ -32,6 +33,8 @@ import (
// VFSPipe represents the actual pipe, analogous to an inode. VFSPipes should
// not be copied.
+//
+// +stateify savable
type VFSPipe struct {
// mu protects the fields below.
mu sync.Mutex `state:"nosave"`
@@ -61,11 +64,18 @@ func NewVFSPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *VFSPipe {
//
// Preconditions: statusFlags should not contain an open access mode.
func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, *vfs.FileDescription) {
- return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags)
+ // Connected pipes share the same locks.
+ locks := &vfs.FileLocks{}
+ return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags, locks), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags, locks)
+}
+
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (*VFSPipe) Allocate(context.Context, uint64, uint64, uint64) error {
+ return syserror.ESPIPE
}
// Open opens the pipe represented by vp.
-func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, error) {
+func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) (*vfs.FileDescription, error) {
vp.mu.Lock()
defer vp.mu.Unlock()
@@ -75,7 +85,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s
return nil, syserror.EINVAL
}
- fd := vp.newFD(mnt, vfsd, statusFlags)
+ fd := vp.newFD(mnt, vfsd, statusFlags, locks)
// Named pipes have special blocking semantics during open:
//
@@ -98,7 +108,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s
// If this pipe is being opened as blocking and there's no
// writer, we have to wait for a writer to open the other end.
if vp.pipe.isNamed && statusFlags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) {
- fd.DecRef()
+ fd.DecRef(ctx)
return nil, syserror.EINTR
}
@@ -109,12 +119,12 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s
// Non-blocking, write-only opens fail with ENXIO when the read
// side isn't open yet.
if statusFlags&linux.O_NONBLOCK != 0 {
- fd.DecRef()
+ fd.DecRef(ctx)
return nil, syserror.ENXIO
}
// Wait for a reader to open the other end.
if !waitFor(&vp.mu, &vp.rWakeup, ctx) {
- fd.DecRef()
+ fd.DecRef(ctx)
return nil, syserror.EINTR
}
}
@@ -127,10 +137,11 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s
}
// Preconditions: vp.mu must be held.
-func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) *vfs.FileDescription {
+func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) *vfs.FileDescription {
fd := &VFSPipeFD{
pipe: &vp.pipe,
}
+ fd.LockFD.Init(locks)
fd.vfsfd.Init(fd, statusFlags, mnt, vfsd, &vfs.FileDescriptionOptions{
DenyPRead: true,
DenyPWrite: true,
@@ -155,16 +166,19 @@ func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) *
// VFSPipeFD implements vfs.FileDescriptionImpl for pipes. It also implements
// non-atomic usermem.IO methods, allowing it to be passed as usermem.IO to
// other FileDescriptions for splice(2) and tee(2).
+//
+// +stateify savable
type VFSPipeFD struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
+ vfs.LockFD
pipe *Pipe
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *VFSPipeFD) Release() {
+func (fd *VFSPipeFD) Release(context.Context) {
var event waiter.EventMask
if fd.vfsfd.IsReadable() {
fd.pipe.rClose()
@@ -195,6 +209,11 @@ func (fd *VFSPipeFD) Readiness(mask waiter.EventMask) waiter.EventMask {
}
}
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *VFSPipeFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ return syserror.ESPIPE
+}
+
// EventRegister implements waiter.Waitable.EventRegister.
func (fd *VFSPipeFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
fd.pipe.EventRegister(e, mask)
@@ -222,8 +241,7 @@ func (fd *VFSPipeFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal
// PipeSize implements fcntl(F_GETPIPE_SZ).
func (fd *VFSPipeFD) PipeSize() int64 {
- // Inline Pipe.FifoSize() rather than calling it with nil Context and
- // fs.File and ignoring the returned error (which is always nil).
+ // Inline Pipe.FifoSize() since we don't have a fs.File.
fd.pipe.mu.Lock()
defer fd.pipe.mu.Unlock()
return fd.pipe.max
@@ -234,19 +252,57 @@ func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) {
return fd.pipe.SetFifoSize(size)
}
-// IOSequence returns a useremm.IOSequence that reads up to count bytes from,
-// or writes up to count bytes to, fd.
-func (fd *VFSPipeFD) IOSequence(count int64) usermem.IOSequence {
- return usermem.IOSequence{
+// SpliceToNonPipe performs a splice operation from fd to a non-pipe file.
+func (fd *VFSPipeFD) SpliceToNonPipe(ctx context.Context, out *vfs.FileDescription, off, count int64) (int64, error) {
+ fd.pipe.mu.Lock()
+ defer fd.pipe.mu.Unlock()
+
+ // Cap the sequence at number of bytes actually available.
+ v := fd.pipe.queuedLocked()
+ if v < count {
+ count = v
+ }
+ src := usermem.IOSequence{
IO: fd,
Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}),
}
+
+ var (
+ n int64
+ err error
+ )
+ if off == -1 {
+ n, err = out.Write(ctx, src, vfs.WriteOptions{})
+ } else {
+ n, err = out.PWrite(ctx, src, off, vfs.WriteOptions{})
+ }
+ if n > 0 {
+ fd.pipe.view.TrimFront(n)
+ }
+ return n, err
}
-// CopyIn implements usermem.IO.CopyIn.
+// SpliceFromNonPipe performs a splice operation from a non-pipe file to fd.
+func (fd *VFSPipeFD) SpliceFromNonPipe(ctx context.Context, in *vfs.FileDescription, off, count int64) (int64, error) {
+ fd.pipe.mu.Lock()
+ defer fd.pipe.mu.Unlock()
+
+ dst := usermem.IOSequence{
+ IO: fd,
+ Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}),
+ }
+
+ if off == -1 {
+ return in.Read(ctx, dst, vfs.ReadOptions{})
+ }
+ return in.PRead(ctx, dst, off, vfs.ReadOptions{})
+}
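SpliceToNonPipe and SpliceFromNonPipe take the pipe lock once and reuse the pipe itself as the usermem.IO for the other file's read or write, with off == -1 selecting the file's current offset. A hedged, in-package sketch of how a splice-style syscall layer might dispatch on direction (spliceOneEnd is hypothetical):

    // spliceOneEnd moves up to count bytes between a pipe FD and a
    // regular file; exactly one end of splice(2) may be a non-pipe.
    func spliceOneEnd(ctx context.Context, pfd *VFSPipeFD, other *vfs.FileDescription, toPipe bool, off, count int64) (int64, error) {
        if toPipe {
            return pfd.SpliceFromNonPipe(ctx, other, off, count)
        }
        return pfd.SpliceToNonPipe(ctx, other, off, count)
    }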
+
+// CopyIn implements usermem.IO.CopyIn. Note that it is the caller's
+// responsibility to trim fd.pipe.view after the read is completed.
func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) {
origCount := int64(len(dst))
- n, err := fd.pipe.read(ctx, readOps{
+ n, err := fd.pipe.readLocked(ctx, readOps{
left: func() int64 {
return int64(len(dst))
},
@@ -255,7 +311,6 @@ func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte,
},
read: func(view *buffer.View) (int64, error) {
n, err := view.ReadAt(dst, 0)
- view.TrimFront(int64(n))
return int64(n), err
},
})
@@ -271,7 +326,7 @@ func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte,
// CopyOut implements usermem.IO.CopyOut.
func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) {
origCount := int64(len(src))
- n, err := fd.pipe.write(ctx, writeOps{
+ n, err := fd.pipe.writeLocked(ctx, writeOps{
left: func() int64 {
return int64(len(src))
},
@@ -295,7 +350,7 @@ func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte,
// ZeroOut implements usermem.IO.ZeroOut.
func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) {
origCount := toZero
- n, err := fd.pipe.write(ctx, writeOps{
+ n, err := fd.pipe.writeLocked(ctx, writeOps{
left: func() int64 {
return toZero
},
@@ -316,14 +371,15 @@ func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int6
return n, err
}
-// CopyInTo implements usermem.IO.CopyInTo.
+// CopyInTo implements usermem.IO.CopyInTo. Note that it is the caller's
+// responsibility to trim fd.pipe.view after the read is completed.
func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) {
count := ars.NumBytes()
if count == 0 {
return 0, nil
}
origCount := count
- n, err := fd.pipe.read(ctx, readOps{
+ n, err := fd.pipe.readLocked(ctx, readOps{
left: func() int64 {
return count
},
@@ -332,7 +388,6 @@ func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst
},
read: func(view *buffer.View) (int64, error) {
n, err := view.ReadToSafememWriter(dst, uint64(count))
- view.TrimFront(int64(n))
return int64(n), err
},
})
@@ -352,7 +407,7 @@ func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq,
return 0, nil
}
origCount := count
- n, err := fd.pipe.write(ctx, writeOps{
+ n, err := fd.pipe.writeLocked(ctx, writeOps{
left: func() int64 {
return count
},
@@ -446,3 +501,13 @@ func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFr
}
return n, err
}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *VFSPipeFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *VFSPipeFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
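Both ends of a connected pipe must agree on lock state, which is why ReaderWriterPair above threads one vfs.FileLocks through both newFD calls. A hedged, in-package sketch making the sharing explicit (connectedEnds is hypothetical; it assumes vp.mu is held, per newFD's precondition):

    func connectedEnds(vp *VFSPipe, mnt *vfs.Mount, vfsd *vfs.Dentry) (*vfs.FileDescription, *vfs.FileDescription) {
        // One FileLocks instance: a POSIX lock taken through the read
        // end is observed (and conflicts) through the write end, since
        // LockPOSIX/UnlockPOSIX on either FD delegate to fd.Locks().
        locks := &vfs.FileLocks{}
        r := vp.newFD(mnt, vfsd, linux.O_RDONLY, locks)
        w := vp.newFD(mnt, vfsd, linux.O_WRONLY, locks)
        return r, w
    }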
diff --git a/pkg/sentry/kernel/pipe/writer.go b/pkg/sentry/kernel/pipe/writer.go
index 5bc6aa931..ef4b70ca3 100644
--- a/pkg/sentry/kernel/pipe/writer.go
+++ b/pkg/sentry/kernel/pipe/writer.go
@@ -15,6 +15,7 @@
package pipe
import (
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -29,7 +30,7 @@ type Writer struct {
// Release implements fs.FileOperations.Release.
//
// This overrides ReaderWriter.Release.
-func (w *Writer) Release() {
+func (w *Writer) Release(context.Context) {
w.Pipe.wClose()
// Wake up readers.
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go
index e23e796ef..1145faf13 100644
--- a/pkg/sentry/kernel/ptrace.go
+++ b/pkg/sentry/kernel/ptrace.go
@@ -18,6 +18,7 @@ import (
"fmt"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/mm"
"gvisor.dev/gvisor/pkg/syserror"
@@ -224,8 +225,9 @@ func (s *ptraceStop) Killable() bool {
// beginPtraceStopLocked does not signal t's tracer or wake it if it is
// waiting.
//
-// Preconditions: The TaskSet mutex must be locked. The caller must be running
-// on the task goroutine.
+// Preconditions:
+// * The TaskSet mutex must be locked.
+// * The caller must be running on the task goroutine.
func (t *Task) beginPtraceStopLocked() bool {
t.tg.signalHandlers.mu.Lock()
defer t.tg.signalHandlers.mu.Unlock()
@@ -270,8 +272,9 @@ func (t *Task) ptraceTrapLocked(code int32) {
// ptraceStop, temporarily preventing it from being removed by a concurrent
// Task.Kill, and returns true. Otherwise it returns false.
//
-// Preconditions: The TaskSet mutex must be locked. The caller must be running
-// on the task goroutine of t's tracer.
+// Preconditions:
+// * The TaskSet mutex must be locked.
+// * The caller must be running on the task goroutine of t's tracer.
func (t *Task) ptraceFreeze() bool {
t.tg.signalHandlers.mu.Lock()
defer t.tg.signalHandlers.mu.Unlock()
@@ -301,8 +304,9 @@ func (t *Task) ptraceUnfreeze() {
t.ptraceUnfreezeLocked()
}
-// Preconditions: t must be in a frozen ptraceStop. t's signal mutex must be
-// locked.
+// Preconditions:
+// * t must be in a frozen ptraceStop.
+// * t's signal mutex must be locked.
func (t *Task) ptraceUnfreezeLocked() {
// Do this even if the task has been killed to ensure a panic if t.stop is
// nil or not a ptraceStop.
@@ -497,8 +501,9 @@ func (t *Task) forgetTracerLocked() {
// ptraceSignalLocked is called after signal dequeueing to check if t should
// enter ptrace signal-delivery-stop.
//
-// Preconditions: The signal mutex must be locked. The caller must be running
-// on the task goroutine.
+// Preconditions:
+// * The signal mutex must be locked.
+// * The caller must be running on the task goroutine.
func (t *Task) ptraceSignalLocked(info *arch.SignalInfo) bool {
if linux.Signal(info.Signo) == linux.SIGKILL {
return false
@@ -828,8 +833,9 @@ func (t *Task) ptraceInterrupt(target *Task) error {
return nil
}
-// Preconditions: The TaskSet mutex must be locked for writing. t must have a
-// tracer.
+// Preconditions:
+// * The TaskSet mutex must be locked for writing.
+// * t must have a tracer.
func (t *Task) ptraceSetOptionsLocked(opts uintptr) error {
const valid = uintptr(linux.PTRACE_O_EXITKILL |
linux.PTRACE_O_TRACESYSGOOD |
@@ -994,18 +1000,15 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
// at the address specified by the data parameter, and the return value
// is the error flag." - ptrace(2)
word := t.Arch().Native(0)
- if _, err := usermem.CopyObjectIn(t, target.MemoryManager(), addr, word, usermem.IOOpts{
- IgnorePermissions: true,
- }); err != nil {
+ if _, err := word.CopyIn(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr); err != nil {
return err
}
- _, err := t.CopyOut(data, word)
+ _, err := word.CopyOut(t, data)
return err
case linux.PTRACE_POKETEXT, linux.PTRACE_POKEDATA:
- _, err := usermem.CopyObjectOut(t, target.MemoryManager(), addr, t.Arch().Native(uintptr(data)), usermem.IOOpts{
- IgnorePermissions: true,
- })
+ word := t.Arch().Native(uintptr(data))
+ _, err := word.CopyOut(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr)
return err
case linux.PTRACE_GETREGSET:
@@ -1018,6 +1021,9 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
if err != nil {
return err
}
+
+ t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch())
+
ar := ars.Head()
n, err := target.Arch().PtraceGetRegSet(uintptr(addr), &usermem.IOReadWriter{
Ctx: t,
@@ -1044,10 +1050,14 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
if err != nil {
return err
}
+
+ mm := t.MemoryManager()
+ t.p.PullFullState(mm.AddressSpace(), t.Arch())
+
ar := ars.Head()
n, err := target.Arch().PtraceSetRegSet(uintptr(addr), &usermem.IOReadWriter{
Ctx: t,
- IO: t.MemoryManager(),
+ IO: mm,
Addr: ar.Start,
Opts: usermem.IOOpts{
AddressSpaceActive: true,
@@ -1056,6 +1066,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
if err != nil {
return err
}
+ t.p.FullStateChanged()
ar.End -= usermem.Addr(n)
return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar))
@@ -1065,12 +1076,12 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
if target.ptraceSiginfo == nil {
return syserror.EINVAL
}
- _, err := t.CopyOut(data, target.ptraceSiginfo)
+ _, err := target.ptraceSiginfo.CopyOut(t, data)
return err
case linux.PTRACE_SETSIGINFO:
var info arch.SignalInfo
- if _, err := t.CopyIn(data, &info); err != nil {
+ if _, err := info.CopyIn(t, data); err != nil {
return err
}
t.tg.pidns.owner.mu.RLock()
@@ -1085,7 +1096,8 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
if addr != linux.SignalSetSize {
return syserror.EINVAL
}
- _, err := t.CopyOut(data, target.SignalMask())
+ mask := target.SignalMask()
+ _, err := mask.CopyOut(t, data)
return err
case linux.PTRACE_SETSIGMASK:
@@ -1093,7 +1105,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
return syserror.EINVAL
}
var mask linux.SignalSet
- if _, err := t.CopyIn(data, &mask); err != nil {
+ if _, err := mask.CopyIn(t, data); err != nil {
return err
}
// The target's task goroutine is stopped, so this is safe:
@@ -1108,7 +1120,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
case linux.PTRACE_GETEVENTMSG:
t.tg.pidns.owner.mu.RLock()
defer t.tg.pidns.owner.mu.RUnlock()
- _, err := t.CopyOut(usermem.Addr(data), target.ptraceEventMsg)
+ _, err := primitive.CopyUint64Out(t, usermem.Addr(data), target.ptraceEventMsg)
return err
// PEEKSIGINFO is unimplemented but seems to have no users anywhere.
diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go
index cef1276ec..609ad3941 100644
--- a/pkg/sentry/kernel/ptrace_amd64.go
+++ b/pkg/sentry/kernel/ptrace_amd64.go
@@ -30,7 +30,7 @@ func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) erro
if err != nil {
return err
}
- _, err = t.CopyOut(data, n)
+ _, err = n.CopyOut(t, data)
return err
case linux.PTRACE_POKEUSR: // aka PTRACE_POKEUSER
diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go
index 18416643b..2a9023fdf 100644
--- a/pkg/sentry/kernel/rseq.go
+++ b/pkg/sentry/kernel/rseq.go
@@ -173,8 +173,10 @@ func (t *Task) OldRSeqCPUAddr() usermem.Addr {
// SetOldRSeqCPUAddr replaces the address that old rseq will keep updated with
// t's CPU number.
//
-// Preconditions: t.RSeqAvailable() == true. The caller must be running on the
-// task goroutine. t's AddressSpace must be active.
+// Preconditions:
+// * t.RSeqAvailable() == true.
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error {
t.oldRSeqCPUAddr = addr
@@ -189,8 +191,9 @@ func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error {
return nil
}
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) rseqUpdateCPU() error {
if t.rseqAddr == 0 && t.oldRSeqCPUAddr == 0 {
t.rseqCPU = -1
@@ -209,8 +212,9 @@ func (t *Task) rseqUpdateCPU() error {
return oerr
}
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) oldRSeqCopyOutCPU() error {
if t.oldRSeqCPUAddr == 0 {
return nil
@@ -222,8 +226,9 @@ func (t *Task) oldRSeqCopyOutCPU() error {
return err
}
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) rseqCopyOutCPU() error {
if t.rseqAddr == 0 {
return nil
@@ -240,8 +245,9 @@ func (t *Task) rseqCopyOutCPU() error {
return err
}
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) rseqClearCPU() error {
buf := t.CopyScratchBuffer(8)
// CPUIDStart and CPUID are the first two fields in linux.RSeq.
@@ -269,8 +275,9 @@ func (t *Task) rseqClearCPU() error {
//
// See kernel/rseq.c:rseq_ip_fixup for reference.
//
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) rseqAddrInterrupt() {
if t.rseqAddr == 0 {
return
diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go
index c38c5a40c..387edfa91 100644
--- a/pkg/sentry/kernel/seccomp.go
+++ b/pkg/sentry/kernel/seccomp.go
@@ -18,7 +18,6 @@ import (
"syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/bpf"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/syserror"
@@ -27,25 +26,18 @@ import (
const maxSyscallFilterInstructions = 1 << 15
-// seccompData is equivalent to struct seccomp_data, which contains the data
-// passed to seccomp-bpf filters.
-type seccompData struct {
- // nr is the system call number.
- nr int32
-
- // arch is an AUDIT_ARCH_* value indicating the system call convention.
- arch uint32
-
- // instructionPointer is the value of the instruction pointer at the time
- // of the system call.
- instructionPointer uint64
-
- // args contains the first 6 system call arguments.
- args [6]uint64
-}
-
-func (d *seccompData) asBPFInput() bpf.Input {
- return bpf.InputBytes{binary.Marshal(nil, usermem.ByteOrder, d), usermem.ByteOrder}
+// dataAsBPFInput returns d serialized as BPF input, valid only on the current
+// task goroutine, since the result aliases the task's scratch buffer.
+//
+// Note: this is called for every syscall, which is a very hot path.
+func dataAsBPFInput(t *Task, d *linux.SeccompData) bpf.Input {
+ buf := t.CopyScratchBuffer(d.SizeBytes())
+ d.MarshalUnsafe(buf)
+ return bpf.InputBytes{
+ Data: buf,
+ // Go-marshal always uses the native byte order.
+ Order: usermem.ByteOrder,
+ }
}
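A hedged sketch of how this input feeds filter evaluation; the real loop is evaluateSyscallFilters below, which walks filters and keeps the most restrictive action (runFilters is hypothetical):

    func runFilters(t *Task, d *linux.SeccompData, filters []bpf.Program) uint32 {
        input := dataAsBPFInput(t, d)
        ret := uint32(linux.SECCOMP_RET_ALLOW)
        for _, f := range filters {
            thisRet, err := bpf.Exec(f, input)
            if err != nil {
                // Match the kernel: a filter that fails to execute
                // kills the task.
                return uint32(linux.SECCOMP_RET_KILL_THREAD)
            }
            // Lower action values are more restrictive and win.
            if (thisRet & linux.SECCOMP_RET_ACTION) < (ret & linux.SECCOMP_RET_ACTION) {
                ret = thisRet
            }
        }
        return ret
    }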
func seccompSiginfo(t *Task, errno, sysno int32, ip usermem.Addr) *arch.SignalInfo {
@@ -112,20 +104,20 @@ func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip u
}
func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 {
- data := seccompData{
- nr: sysno,
- arch: t.tc.st.AuditNumber,
- instructionPointer: uint64(ip),
+ data := linux.SeccompData{
+ Nr: sysno,
+ Arch: t.tc.st.AuditNumber,
+ InstructionPointer: uint64(ip),
}
// data.Args is []uint64 and args is []arch.SyscallArgument (uintptr), so
// we can't do any slicing tricks or even use copy/append here.
for i, arg := range args {
- if i >= len(data.args) {
+ if i >= len(data.Args) {
break
}
- data.args[i] = arg.Uint64()
+ data.Args[i] = arg.Uint64()
}
- input := data.asBPFInput()
+ input := dataAsBPFInput(t, &data)
ret := uint32(linux.SECCOMP_RET_ALLOW)
f := t.syscallFilters.Load()
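
The hot-path trick above — serializing into a reusable per-task scratch buffer instead of allocating on every syscall — is easiest to see in isolation. The following standalone Go sketch shows the same idea; the types and names are illustrative only, and byte order is fixed to little-endian for simplicity where the real code uses go-marshal with the native order:

package main

import (
	"encoding/binary"
	"fmt"
)

// seccompData mirrors Linux's struct seccomp_data (layout for illustration).
type seccompData struct {
	Nr                 int32
	Arch               uint32
	InstructionPointer uint64
	Args               [6]uint64
}

// sizeBytes is the wire size of seccompData: 4 + 4 + 8 + 6*8 = 64.
const sizeBytes = 64

// marshal serializes d into buf without allocating, analogous to
// MarshalUnsafe into the task's scratch buffer.
func (d *seccompData) marshal(buf []byte) {
	binary.LittleEndian.PutUint32(buf[0:], uint32(d.Nr))
	binary.LittleEndian.PutUint32(buf[4:], d.Arch)
	binary.LittleEndian.PutUint64(buf[8:], d.InstructionPointer)
	for i, a := range d.Args {
		binary.LittleEndian.PutUint64(buf[16+8*i:], a)
	}
}

func main() {
	scratch := make([]byte, sizeBytes) // reused across syscalls in the real code
	d := seccompData{Nr: 1, Arch: 0xc000003e, InstructionPointer: 0x401000}
	d.marshal(scratch)
	fmt.Printf("% x\n", scratch[:16])
}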
diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go
index c00fa1138..c39ecfb8f 100644
--- a/pkg/sentry/kernel/semaphore/semaphore.go
+++ b/pkg/sentry/kernel/semaphore/semaphore.go
@@ -283,6 +283,33 @@ func (s *Set) Change(ctx context.Context, creds *auth.Credentials, owner fs.File
return nil
}
+// GetStat extracts semid_ds information from the set.
+func (s *Set) GetStat(creds *auth.Credentials) (*linux.SemidDS, error) {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ // "The calling process must have read permission on the semaphore set."
+ if !s.checkPerms(creds, fs.PermMask{Read: true}) {
+ return nil, syserror.EACCES
+ }
+
+ ds := &linux.SemidDS{
+ SemPerm: linux.IPCPerm{
+ Key: uint32(s.key),
+ UID: uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)),
+ GID: uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)),
+ CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)),
+ CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)),
+ Mode: uint16(s.perms.LinuxMode()),
+ Seq: 0, // IPC sequence not supported.
+ },
+ SemOTime: s.opTime.TimeT(),
+ SemCTime: s.changeTime.TimeT(),
+ SemNSems: uint64(s.Size()),
+ }
+ return ds, nil
+}
+
// SetVal overrides a semaphore value, waking up waiters as needed.
func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials, pid int32) error {
if val < 0 || val > valueMax {
@@ -320,7 +347,7 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti
}
for _, val := range vals {
- if val < 0 || val > valueMax {
+ if val > valueMax {
return syserror.ERANGE
}
}
diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go
index 0e19286de..df5c8421b 100644
--- a/pkg/sentry/kernel/sessions.go
+++ b/pkg/sentry/kernel/sessions.go
@@ -16,7 +16,6 @@ package kernel
import (
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -31,7 +30,7 @@ type ProcessGroupID ThreadID
//
// +stateify savable
type Session struct {
- refs refs.AtomicRefCount
+ SessionRefs
// leader is the originator of the Session.
//
@@ -61,16 +60,11 @@ type Session struct {
sessionEntry
}
-// incRef grabs a reference.
-func (s *Session) incRef() {
- s.refs.IncRef()
-}
-
-// decRef drops a reference.
+// DecRef drops a reference.
//
// Precondition: callers must hold TaskSet.mu for writing.
-func (s *Session) decRef() {
- s.refs.DecRefWithDestructor(func() {
+func (s *Session) DecRef() {
+ s.SessionRefs.DecRef(func() {
// Remove translations from the leader.
for ns := s.leader.pidns; ns != nil; ns = ns.parent {
id := ns.sids[s]
@@ -87,7 +81,7 @@ func (s *Session) decRef() {
//
// +stateify savable
type ProcessGroup struct {
- refs refs.AtomicRefCount // not exported.
+ refs ProcessGroupRefs
// originator is the originator of the group.
//
@@ -162,7 +156,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) {
}
alive := true
- pg.refs.DecRefWithDestructor(func() {
+ pg.refs.DecRef(func() {
alive = false // don't bother with handleOrphan.
// Remove translations from the originator.
@@ -174,7 +168,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) {
// Remove the list of process groups.
pg.session.processGroups.Remove(pg)
- pg.session.decRef()
+ pg.session.DecRef()
})
if alive {
pg.handleOrphan()
@@ -301,7 +295,7 @@ func (tg *ThreadGroup) createSession() error {
id: SessionID(id),
leader: tg,
}
- s.refs.EnableLeakCheck("kernel.Session")
+ s.EnableLeakCheck()
// Create a new ProcessGroup, belonging to that Session.
// This also has a single reference (assigned below).
@@ -315,7 +309,7 @@ func (tg *ThreadGroup) createSession() error {
session: s,
ancestors: 0,
}
- pg.refs.EnableLeakCheck("kernel.ProcessGroup")
+ pg.refs.EnableLeakCheck()
// Tie them and return the result.
s.processGroups.PushBack(pg)
@@ -395,13 +389,13 @@ func (tg *ThreadGroup) CreateProcessGroup() error {
//
// We manually adjust the ancestors if the parent is in the same
// session.
- tg.processGroup.session.incRef()
+ tg.processGroup.session.IncRef()
pg := ProcessGroup{
id: ProcessGroupID(id),
originator: tg,
session: tg.processGroup.session,
}
- pg.refs.EnableLeakCheck("kernel.ProcessGroup")
+ pg.refs.EnableLeakCheck()
if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session {
pg.ancestors++
diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD
index bfd779837..80a592c8f 100644
--- a/pkg/sentry/kernel/shm/BUILD
+++ b/pkg/sentry/kernel/shm/BUILD
@@ -1,12 +1,25 @@
load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
package(licenses = ["notice"])
+go_template_instance(
+ name = "shm_refs",
+ out = "shm_refs.go",
+ package = "shm",
+ prefix = "Shm",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "Shm",
+ },
+)
+
go_library(
name = "shm",
srcs = [
"device.go",
"shm.go",
+ "shm_refs.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
@@ -14,13 +27,13 @@ go_library(
"//pkg/context",
"//pkg/log",
"//pkg/refs",
+ "//pkg/refsvfs2",
"//pkg/sentry/device",
"//pkg/sentry/fs",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/kernel/time",
"//pkg/sentry/memmap",
"//pkg/sentry/pgalloc",
- "//pkg/sentry/platform",
"//pkg/sentry/usage",
"//pkg/sync",
"//pkg/syserror",
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index f66cfcc7f..ebbebf46b 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -39,13 +39,11 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/log"
- "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
@@ -253,7 +251,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi
creatorPID: pid,
changeTime: ktime.NowFromContext(ctx),
}
- shm.EnableLeakCheck("kernel.Shm")
+ shm.EnableLeakCheck()
// Find the next available ID.
for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ {
@@ -323,9 +321,32 @@ func (r *Registry) remove(s *Shm) {
r.totalPages -= s.effectiveSize / usermem.PageSize
}
+// Release drops the self-reference of each active shm segment in the registry.
+// It is called when the kernel.IPCNamespace containing r is being destroyed.
+func (r *Registry) Release(ctx context.Context) {
+ // Because Shm.DecRef() may acquire the same locks, collect the segments to
+ // release first. Note that this should not race with any updates to r, since
+ // the IPC namespace containing it has no more references.
+ toRelease := make([]*Shm, 0)
+ r.mu.Lock()
+ for _, s := range r.keysToShms {
+ s.mu.Lock()
+ if !s.pendingDestruction {
+ toRelease = append(toRelease, s)
+ }
+ s.mu.Unlock()
+ }
+ r.mu.Unlock()
+
+ for _, s := range toRelease {
+ r.dissociateKey(s)
+ s.DecRef(ctx)
+ }
+}
+
// Shm represents a single shared memory segment.
//
-// Shm segment are backed directly by an allocation from platform memory.
+// Shm segments are backed directly by an allocation from platform memory.
// Segments are always mapped as a whole, greatly simplifying how mappings are
// tracked. However note that mremap and munmap calls may cause the vma for a
// segment to become fragmented, which requires special care when unmapping a
@@ -338,14 +359,14 @@ func (r *Registry) remove(s *Shm) {
//
// +stateify savable
type Shm struct {
- // AtomicRefCount tracks the number of references to this segment.
+ // ShmRefs tracks the number of references to this segment.
//
// A segment holds a reference to itself until it is marked for
// destruction.
//
// In addition to direct users, the MemoryManager will hold references
// via MappingIdentity.
- refs.AtomicRefCount
+ ShmRefs
mfp pgalloc.MemoryFileProvider
@@ -370,7 +391,7 @@ type Shm struct {
// fr is the offset into mfp.MemoryFile() that backs this contents of this
// segment. Immutable.
- fr platform.FileRange
+ fr memmap.FileRange
// mu protects all fields below.
mu sync.Mutex `state:"nosave"`
@@ -429,11 +450,14 @@ func (s *Shm) InodeID() uint64 {
return uint64(s.ID)
}
-// DecRef overrides refs.RefCount.DecRef with a destructor.
+// DecRef drops a reference on s.
//
// Precondition: Caller must not hold s.mu.
-func (s *Shm) DecRef() {
- s.DecRefWithDestructor(s.destroy)
+func (s *Shm) DecRef(ctx context.Context) {
+ s.ShmRefs.DecRef(func() {
+ s.mfp.MemoryFile().DecRef(s.fr)
+ s.registry.remove(s)
+ })
}
// Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm
@@ -643,30 +667,28 @@ func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error {
return nil
}
-func (s *Shm) destroy() {
- s.mfp.MemoryFile().DecRef(s.fr)
- s.registry.remove(s)
-}
-
// MarkDestroyed marks a segment for destruction. The segment is actually
// destroyed once it has no references. MarkDestroyed may be called multiple
// times, and is safe to call after a segment has already been destroyed. See
// shmctl(IPC_RMID).
-func (s *Shm) MarkDestroyed() {
+func (s *Shm) MarkDestroyed(ctx context.Context) {
s.registry.dissociateKey(s)
s.mu.Lock()
- defer s.mu.Unlock()
- if !s.pendingDestruction {
- s.pendingDestruction = true
- // Drop the self-reference so destruction occurs when all
- // external references are gone.
- //
- // N.B. This cannot be the final DecRef, as the caller also
- // holds a reference.
- s.DecRef()
+ if s.pendingDestruction {
+ s.mu.Unlock()
return
}
+ s.pendingDestruction = true
+ s.mu.Unlock()
+
+ // Drop the self-reference so destruction occurs when all
+ // external references are gone.
+ //
+ // N.B. This cannot be the final DecRef, as the caller also
+ // holds a reference.
+ s.DecRef(ctx)
+ return
}
// checkOwnership verifies whether a segment may be accessed by ctx as an
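
Registry.Release's collect-then-release shape is the standard way to avoid re-entering locks from DecRef. A standalone sketch of the pattern, with purely illustrative types:

package main

import (
	"fmt"
	"sync"
)

type segment struct {
	mu      sync.Mutex
	id      int
	pending bool
}

type registry struct {
	mu       sync.Mutex
	segments map[int]*segment
}

// release mirrors Registry.Release above: snapshot the segments under the
// registry lock, then drop references only after unlocking, because dropping
// a reference may re-acquire the same locks.
func (r *registry) release(decRef func(*segment)) {
	var toRelease []*segment
	r.mu.Lock()
	for _, s := range r.segments {
		s.mu.Lock()
		if !s.pending {
			toRelease = append(toRelease, s)
		}
		s.mu.Unlock()
	}
	r.mu.Unlock()

	for _, s := range toRelease {
		decRef(s) // may lock r.mu and s.mu again; safe now
	}
}

func main() {
	r := &registry{segments: map[int]*segment{1: {id: 1}}}
	r.release(func(s *segment) { fmt.Println("released segment", s.id) })
}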
diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD
index 3eb78e91b..76d472292 100644
--- a/pkg/sentry/kernel/signalfd/BUILD
+++ b/pkg/sentry/kernel/signalfd/BUILD
@@ -8,7 +8,6 @@ go_library(
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
- "//pkg/binary",
"//pkg/context",
"//pkg/sentry/fs",
"//pkg/sentry/fs/anon",
diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go
index 8243bb93e..78f718cfe 100644
--- a/pkg/sentry/kernel/signalfd/signalfd.go
+++ b/pkg/sentry/kernel/signalfd/signalfd.go
@@ -17,7 +17,6 @@ package signalfd
import (
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/anon"
@@ -76,7 +75,7 @@ func New(ctx context.Context, mask linux.SignalSet) (*fs.File, error) {
}
// Release implements fs.FileOperations.Release.
-func (s *SignalOperations) Release() {}
+func (s *SignalOperations) Release(context.Context) {}
// Mask returns the signal mask.
func (s *SignalOperations) Mask() linux.SignalSet {
@@ -103,8 +102,7 @@ func (s *SignalOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS
}
// Copy out the signal info using the specified format.
- var buf [128]byte
- binary.Marshal(buf[:0], usermem.ByteOrder, &linux.SignalfdSiginfo{
+ infoNative := linux.SignalfdSiginfo{
Signo: uint32(info.Signo),
Errno: info.Errno,
Code: info.Code,
@@ -113,9 +111,13 @@ func (s *SignalOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS
Status: info.Status(),
Overrun: uint32(info.Overrun()),
Addr: info.Addr(),
- })
- n, err := dst.CopyOut(ctx, buf[:])
- return int64(n), err
+ }
+ n, err := infoNative.WriteTo(dst.Writer(ctx))
+ if err == usermem.ErrEndOfIOSequence {
+ // Partial copy-out ok.
+ err = nil
+ }
+ return n, err
}
// Readiness implements waiter.Waitable.Readiness.
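
The error handling above deliberately tolerates a partial copy-out. A standalone sketch of that pattern, where errEndOfIOSequence is a hypothetical stand-in for usermem.ErrEndOfIOSequence:

package main

import (
	"bytes"
	"errors"
	"fmt"
	"io"
)

// errEndOfIOSequence stands in for usermem.ErrEndOfIOSequence: the writer
// ran out of destination space mid-serialization.
var errEndOfIOSequence = errors.New("write beyond end of IOSequence")

// writeAllowingPartial mirrors the Read change above: serialize straight into
// the destination writer and treat exhausting the destination as a successful
// partial copy-out rather than an error.
func writeAllowingPartial(w io.Writer, data []byte) (int64, error) {
	n, err := w.Write(data)
	if errors.Is(err, errEndOfIOSequence) {
		err = nil // partial copy-out OK
	}
	return int64(n), err
}

func main() {
	var buf bytes.Buffer
	n, err := writeAllowingPartial(&buf, []byte("siginfo bytes"))
	fmt.Println(n, err)
}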
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
index 413111faf..332bdb8e8 100644
--- a/pkg/sentry/kernel/syscalls.go
+++ b/pkg/sentry/kernel/syscalls.go
@@ -348,6 +348,16 @@ func (s *SyscallTable) LookupName(sysno uintptr) string {
return fmt.Sprintf("sys_%d", sysno) // Unlikely.
}
+// LookupNo looks up a syscall number by name.
+func (s *SyscallTable) LookupNo(name string) (uintptr, error) {
+ for i, syscall := range s.Table {
+ if syscall.Name == name {
+ return uintptr(i), nil
+ }
+ }
+ return 0, fmt.Errorf("syscall %q not found", name)
+}
+
// LookupEmulate looks up an emulation syscall number.
func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) {
sysno, ok := s.Emulate[addr]
diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go
index 4607cde2f..a83ce219c 100644
--- a/pkg/sentry/kernel/syslog.go
+++ b/pkg/sentry/kernel/syslog.go
@@ -98,6 +98,15 @@ func (s *syslog) Log() []byte {
s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, selectMessage()))...)
}
+ if VFS2Enabled {
+ time += rand.Float64() / 2
+ s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Setting up VFS2..."))...)
+ if FUSEEnabled {
+ time += rand.Float64() / 2
+ s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Setting up FUSE..."))...)
+ }
+ }
+
time += rand.Float64() / 2
s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Ready!"))...)
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index f48247c94..037971393 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -68,6 +68,21 @@ type Task struct {
// runState is exclusive to the task goroutine.
runState taskRunState
+ // taskWorkCount represents the current size of the task work queue. It is
+ // used to avoid acquiring taskWorkMu when the queue is empty.
+ //
+ // Must be accessed with atomic memory operations.
+ taskWorkCount int32
+
+ // taskWorkMu protects taskWork.
+ taskWorkMu sync.Mutex `state:"nosave"`
+
+ // taskWork is a queue of work to be executed before resuming user execution.
+ // It is similar to the task_work mechanism in Linux.
+ //
+ // taskWork is exclusive to the task goroutine.
+ taskWork []TaskWorker
+
// haveSyscallReturn is true if tc.Arch().Return() represents a value
// returned by a syscall (or set by ptrace after a syscall).
//
@@ -550,11 +565,20 @@ type Task struct {
// futexWaiter is exclusive to the task goroutine.
futexWaiter *futex.Waiter `state:"nosave"`
+ // robustList is a pointer to the head of the task's robust futex
+ // list.
+ robustList usermem.Addr
+
// startTime is the real time at which the task started. It is set when
// a Task is created or invokes execve(2).
//
// startTime is protected by mu.
startTime ktime.Time
+
+ // kcov is the kcov instance providing code coverage owned by this task.
+ //
+ // kcov is exclusive to the task goroutine.
+ kcov *Kcov
}
func (t *Task) savePtraceTracer() *Task {
@@ -632,7 +656,9 @@ func (t *Task) Value(key interface{}) interface{} {
case CtxUTSNamespace:
return t.utsns
case CtxIPCNamespace:
- return t.ipcns
+ ipcns := t.IPCNamespace()
+ ipcns.IncRef()
+ return ipcns
case CtxTask:
return t
case auth.CtxCredentials:
@@ -711,17 +737,16 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock {
func (t *Task) IsChrooted() bool {
if VFS2Enabled {
realRoot := t.mountNamespaceVFS2.Root()
- defer realRoot.DecRef()
root := t.fsContext.RootDirectoryVFS2()
- defer root.DecRef()
+ defer root.DecRef(t)
return root != realRoot
}
realRoot := t.tg.mounts.Root()
- defer realRoot.DecRef()
+ defer realRoot.DecRef(t)
root := t.fsContext.RootDirectory()
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(t)
}
return root != realRoot
}
@@ -844,7 +869,6 @@ func (t *Task) MountNamespace() *fs.MountNamespace {
func (t *Task) MountNamespaceVFS2() *vfs.MountNamespace {
t.mu.Lock()
defer t.mu.Unlock()
- t.mountNamespaceVFS2.IncRef()
return t.mountNamespaceVFS2
}
@@ -884,3 +908,16 @@ func (t *Task) UID() uint32 {
func (t *Task) GID() uint32 {
return uint32(t.Credentials().EffectiveKGID)
}
+
+// SetKcov sets the kcov instance associated with t.
+func (t *Task) SetKcov(k *Kcov) {
+ t.kcov = k
+}
+
+// ResetKcov clears the kcov instance associated with t.
+func (t *Task) ResetKcov() {
+ if t.kcov != nil {
+ t.kcov.OnTaskExit()
+ t.kcov = nil
+ }
+}
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index e1ecca99e..682080c14 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -19,6 +19,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/bpf"
+ "gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -161,6 +162,10 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
return 0, nil, syserror.EINVAL
}
+ // Pull task registers and FPU state; a cloned task will inherit the
+ // state of the current task.
+ t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch())
+
// "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a
// single clone(2) or unshare(2) call, the user namespace is guaranteed to
// be created first, giving the child (clone(2)) or caller (unshare(2))
@@ -199,7 +204,13 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
// Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
// namespace"
ipcns = NewIPCNamespace(userns)
+ } else {
+ ipcns.IncRef()
}
+ cu := cleanup.Make(func() {
+ ipcns.DecRef(t)
+ })
+ defer cu.Clean()
netns := t.NetworkNamespace()
if opts.NewNetworkNamespace {
@@ -210,12 +221,18 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
mntnsVFS2 := t.mountNamespaceVFS2
if mntnsVFS2 != nil {
mntnsVFS2.IncRef()
+ cu.Add(func() {
+ mntnsVFS2.DecRef(t)
+ })
}
tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace)
if err != nil {
return 0, nil, err
}
+ cu.Add(func() {
+ tc.release()
+ })
// clone() returns 0 in the child.
tc.Arch.SetReturn(0)
if opts.Stack != 0 {
@@ -237,7 +254,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
var fdTable *FDTable
if opts.NewFiles {
- fdTable = t.fdTable.Fork()
+ fdTable = t.fdTable.Fork(t)
} else {
fdTable = t.fdTable
fdTable.IncRef()
@@ -291,11 +308,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
} else {
cfg.InheritParent = t
}
- nt, err := t.tg.pidns.owner.NewTask(cfg)
+ nt, err := t.tg.pidns.owner.NewTask(t, cfg)
+ // If NewTask succeeds, we transfer references to nt. If NewTask fails, it does
+ // the cleanup for us.
+ cu.Release()
if err != nil {
- if opts.NewThreadGroup {
- tg.release()
- }
return 0, nil, err
}
@@ -337,12 +354,12 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
nt.SetClearTID(opts.ChildTID)
}
if opts.ChildSetTID {
- // Can't use Task.CopyOut, which assumes AddressSpaceActive.
- usermem.CopyObjectOut(t, nt.MemoryManager(), opts.ChildTID, nt.ThreadID(), usermem.IOOpts{})
+ ctid := nt.ThreadID()
+ ctid.CopyOut(nt.AsCopyContext(usermem.IOOpts{AddressSpaceActive: false}), opts.ChildTID)
}
ntid := t.tg.pidns.IDOfTask(nt)
if opts.ParentSetTID {
- t.CopyOut(opts.ParentTID, ntid)
+ ntid.CopyOut(t, opts.ParentTID)
}
kind := ptraceCloneKindClone
@@ -505,12 +522,13 @@ func (t *Task) Unshare(opts *SharingOptions) error {
}
// Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
// namespace"
+ t.ipcns.DecRef(t)
t.ipcns = NewIPCNamespace(creds.UserNamespace)
}
var oldFDTable *FDTable
if opts.NewFiles {
oldFDTable = t.fdTable
- t.fdTable = oldFDTable.Fork()
+ t.fdTable = oldFDTable.Fork(t)
}
var oldFSContext *FSContext
if opts.NewFSContext {
@@ -519,10 +537,10 @@ func (t *Task) Unshare(opts *SharingOptions) error {
}
t.mu.Unlock()
if oldFDTable != nil {
- oldFDTable.DecRef()
+ oldFDTable.DecRef(t)
}
if oldFSContext != nil {
- oldFSContext.DecRef()
+ oldFSContext.DecRef(t)
}
return nil
}
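
The cleanup.Make/Add/Release/Clean flow threaded through Clone above is an undo-stack pattern. A standalone miniature, assuming nothing about pkg/cleanup's internals (cleanupStack and its methods are illustrative):

package main

import "fmt"

// cleanupStack accumulates undo actions and runs them on Clean unless
// Release was called first.
type cleanupStack struct {
	fns []func()
}

func newCleanup(fn func()) *cleanupStack {
	return &cleanupStack{fns: []func(){fn}}
}

// Add registers another undo action.
func (c *cleanupStack) Add(fn func()) { c.fns = append(c.fns, fn) }

// Release discards pending actions; used once references are transferred.
func (c *cleanupStack) Release() { c.fns = nil }

// Clean runs pending actions in reverse registration order.
func (c *cleanupStack) Clean() {
	for i := len(c.fns) - 1; i >= 0; i-- {
		c.fns[i]()
	}
	c.fns = nil
}

func main() {
	cu := newCleanup(func() { fmt.Println("ipcns.DecRef") })
	defer cu.Clean()
	cu.Add(func() { fmt.Println("mntnsVFS2.DecRef") })
	cu.Add(func() { fmt.Println("tc.release") })

	// Simulate NewTask succeeding: references were transferred to the new
	// task, so nothing should be undone.
	cu.Release()
}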
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index 9fa528384..d1136461a 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -126,7 +126,11 @@ func (t *Task) SyscallTable() *SyscallTable {
// Preconditions: The caller must be running on the task goroutine, or t.mu
// must be locked.
func (t *Task) Stack() *arch.Stack {
- return &arch.Stack{t.Arch(), t.MemoryManager(), usermem.Addr(t.Arch().Stack())}
+ return &arch.Stack{
+ Arch: t.Arch(),
+ IO: t.MemoryManager(),
+ Bottom: usermem.Addr(t.Arch().Stack()),
+ }
}
// LoadTaskImage loads a specified file into a new TaskContext.
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index 00c425cca..412d471d3 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -198,11 +198,18 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
t.tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{})
t.tg.pidns.owner.mu.Unlock()
+ oldFDTable := t.fdTable
+ t.fdTable = t.fdTable.Fork(t)
+ oldFDTable.DecRef(t)
+
// Remove FDs with the CloseOnExec flag set.
- t.fdTable.RemoveIf(func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool {
+ t.fdTable.RemoveIf(t, func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool {
return flags.CloseOnExec
})
+ // Handle the robust futex list.
+ t.exitRobustList()
+
// NOTE(b/30815691): We currently do not implement privileged
// executables (set-user/group-ID bits and file capabilities). This
// allows us to unconditionally enable user dumpability on the new mm.
@@ -219,6 +226,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
t.tc = *r.tc
t.mu.Unlock()
t.unstopVforkParent()
+ t.p.FullStateChanged()
// NOTE(b/30316266): All locks must be dropped prior to calling Activate.
t.MemoryManager().Activate(t)
@@ -229,9 +237,10 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
// promoteLocked makes t the leader of its thread group. If t is already the
// thread group leader, promoteLocked is a no-op.
//
-// Preconditions: All other tasks in t's thread group, including the existing
-// leader (if it is not t), have reached TaskExitZombie. The TaskSet mutex must
-// be locked for writing.
+// Preconditions:
+// * All other tasks in t's thread group, including the existing leader (if it
+// is not t), have reached TaskExitZombie.
+// * The TaskSet mutex must be locked for writing.
func (t *Task) promoteLocked() {
oldLeader := t.tg.leader
if t == oldLeader {
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index c4ade6e8e..ce7b9641d 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -239,6 +239,8 @@ func (*runExitMain) execute(t *Task) taskRunState {
t.traceExitEvent()
lastExiter := t.exitThreadGroup()
+ t.ResetKcov()
+
// If the task has a cleartid, and the thread group wasn't killed by a
// signal, handle that before releasing the MM.
if t.cleartid != 0 {
@@ -246,13 +248,17 @@ func (*runExitMain) execute(t *Task) taskRunState {
signaled := t.tg.exiting && t.tg.exitStatus.Signaled()
t.tg.signalHandlers.mu.Unlock()
if !signaled {
- if _, err := t.CopyOut(t.cleartid, ThreadID(0)); err == nil {
+ zero := ThreadID(0)
+ if _, err := zero.CopyOut(t, t.cleartid); err == nil {
t.Futex().Wake(t, t.cleartid, false, ^uint32(0), 1)
}
// If the CopyOut fails, there's nothing we can do.
}
}
+ // Handle the robust futex list.
+ t.exitRobustList()
+
// Deactivate the address space and update max RSS before releasing the
// task's MM.
t.Deactivate()
@@ -266,20 +272,21 @@ func (*runExitMain) execute(t *Task) taskRunState {
// Releasing the MM unblocks a blocked CLONE_VFORK parent.
t.unstopVforkParent()
- t.fsContext.DecRef()
- t.fdTable.DecRef()
+ t.fsContext.DecRef(t)
+ t.fdTable.DecRef(t)
t.mu.Lock()
if t.mountNamespaceVFS2 != nil {
- t.mountNamespaceVFS2.DecRef()
+ t.mountNamespaceVFS2.DecRef(t)
t.mountNamespaceVFS2 = nil
}
+ t.ipcns.DecRef(t)
t.mu.Unlock()
// If this is the last task to exit from the thread group, release the
// thread group's resources.
if lastExiter {
- t.tg.release()
+ t.tg.Release(t)
}
// Detach tracees.
diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go
index a53e77c9f..c80391475 100644
--- a/pkg/sentry/kernel/task_futex.go
+++ b/pkg/sentry/kernel/task_futex.go
@@ -15,6 +15,8 @@
package kernel
import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -52,3 +54,127 @@ func (t *Task) LoadUint32(addr usermem.Addr) (uint32, error) {
func (t *Task) GetSharedKey(addr usermem.Addr) (futex.Key, error) {
return t.MemoryManager().GetSharedFutexKey(t, addr)
}
+
+// GetRobustList returns the address of the task's robust futex list.
+func (t *Task) GetRobustList() usermem.Addr {
+ t.mu.Lock()
+ addr := t.robustList
+ t.mu.Unlock()
+ return addr
+}
+
+// SetRobustList sets the robust futex list for the task.
+func (t *Task) SetRobustList(addr usermem.Addr) {
+ t.mu.Lock()
+ t.robustList = addr
+ t.mu.Unlock()
+}
+
+// exitRobustList walks the robust futex list, marking locks dead and notifying
+// wakers. It corresponds to Linux's exit_robust_list(). Following Linux,
+// errors are silently ignored.
+func (t *Task) exitRobustList() {
+ t.mu.Lock()
+ addr := t.robustList
+ t.robustList = 0
+ t.mu.Unlock()
+
+ if addr == 0 {
+ return
+ }
+
+ var rl linux.RobustListHead
+ if _, err := rl.CopyIn(t, usermem.Addr(addr)); err != nil {
+ return
+ }
+
+ next := primitive.Uint64(rl.List)
+ done := 0
+ var pendingLockAddr usermem.Addr
+ if rl.ListOpPending != 0 {
+ pendingLockAddr = usermem.Addr(rl.ListOpPending + rl.FutexOffset)
+ }
+
+ // Wake up normal elements.
+ for usermem.Addr(next) != addr {
+ // We traverse to the next element of the list before we
+ // actually wake anything. This prevents the race where waking
+ // this futex causes a modification of the list.
+ thisLockAddr := usermem.Addr(uint64(next) + rl.FutexOffset)
+
+ // Try to decode the next element in the list before waking the
+ // current futex. But don't check the error until after we've
+ // woken the current futex. Linux does it in this order too.
+ _, nextErr := next.CopyIn(t, usermem.Addr(next))
+
+ // Wake up the current futex if it's not pending.
+ if thisLockAddr != pendingLockAddr {
+ t.wakeRobustListOne(thisLockAddr)
+ }
+
+ // If there was an error copying the next futex, we must bail.
+ if nextErr != nil {
+ break
+ }
+
+ // This is a user structure, so it could be a massive list, or
+ // even contain a loop if they are trying to mess with us. We
+ // cap traversal to prevent that.
+ done++
+ if done >= linux.ROBUST_LIST_LIMIT {
+ break
+ }
+ }
+
+ // Is there a pending entry to wake?
+ if pendingLockAddr != 0 {
+ t.wakeRobustListOne(pendingLockAddr)
+ }
+}
+
+// wakeRobustListOne wakes a single futex from the robust list.
+func (t *Task) wakeRobustListOne(addr usermem.Addr) {
+ // Bit 0 in address signals PI futex.
+ pi := addr&1 == 1
+ addr = addr &^ 1
+
+ // Load the futex.
+ f, err := t.LoadUint32(addr)
+ if err != nil {
+ // Can't read this single value? Ignore the problem.
+ // We can wake the other futexes in the list.
+ return
+ }
+
+ tid := uint32(t.ThreadID())
+ for {
+ // Is this held by someone else?
+ if f&linux.FUTEX_TID_MASK != tid {
+ return
+ }
+
+ // This thread is dying and it's holding this futex. We need to
+ // set the owner died bit and wake up any waiters.
+ newF := (f & linux.FUTEX_WAITERS) | linux.FUTEX_OWNER_DIED
+ if curF, err := t.CompareAndSwapUint32(addr, f, newF); err != nil {
+ return
+ } else if curF != f {
+ // Futex changed out from under us. Try again...
+ f = curF
+ continue
+ }
+
+ // Wake waiters if there are any.
+ if f&linux.FUTEX_WAITERS != 0 {
+ private := f&linux.FUTEX_PRIVATE_FLAG != 0
+ if pi {
+ t.Futex().UnlockPI(t, addr, tid, private)
+ return
+ }
+ t.Futex().Wake(t, addr, private, linux.FUTEX_BITSET_MATCH_ANY, 1)
+ }
+
+ // Done.
+ return
+ }
+}
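
The compare-and-swap loop in wakeRobustListOne is the core of robust-futex death notification. A standalone sketch of just that loop — markOwnerDied is an illustrative name; the bit constants follow linux/futex.h:

package main

import (
	"fmt"
	"sync/atomic"
)

// Futex word bits from linux/futex.h.
const (
	futexTIDMask   = 0x3fffffff
	futexWaiters   = 0x80000000
	futexOwnerDied = 0x40000000
)

// markOwnerDied mirrors the loop above: if the dying thread tid owns the
// futex word, set FUTEX_OWNER_DIED while preserving FUTEX_WAITERS, retrying
// if the word changes underneath us. It reports whether waiters should be
// woken.
func markOwnerDied(word *uint32, tid uint32) bool {
	for {
		f := atomic.LoadUint32(word)
		if f&futexTIDMask != tid {
			return false // held by someone else; nothing to do
		}
		newF := (f & futexWaiters) | futexOwnerDied
		if atomic.CompareAndSwapUint32(word, f, newF) {
			return f&futexWaiters != 0
		}
		// The word changed out from under us; retry with the new value.
	}
}

func main() {
	word := uint32(1234 | futexWaiters)
	fmt.Println("wake waiters:", markOwnerDied(&word, 1234))
	fmt.Printf("word now %#x\n", word)
}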
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
index eeccaa197..d23cea802 100644
--- a/pkg/sentry/kernel/task_log.go
+++ b/pkg/sentry/kernel/task_log.go
@@ -27,6 +27,9 @@ const (
// maxStackDebugBytes is the maximum number of user stack bytes that may be
// printed by debugDumpStack.
maxStackDebugBytes = 1024
+ // maxCodeDebugBytes is the maximum number of user code bytes that may be
+ // printed by debugDumpCode.
+ maxCodeDebugBytes = 128
)
// Infof logs a formatted info message by calling log.Infof.
@@ -61,6 +64,7 @@ func (t *Task) IsLogging(level log.Level) bool {
func (t *Task) DebugDumpState() {
t.debugDumpRegisters()
t.debugDumpStack()
+ t.debugDumpCode()
if mm := t.MemoryManager(); mm != nil {
t.Debugf("Mappings:\n%s", mm)
}
@@ -128,6 +132,45 @@ func (t *Task) debugDumpStack() {
}
}
+// debugDumpCode logs user code contents at log level debug.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) debugDumpCode() {
+ if !t.IsLogging(log.Debug) {
+ return
+ }
+ m := t.MemoryManager()
+ if m == nil {
+ t.Debugf("Memory manager for task is gone, skipping application code dump.")
+ return
+ }
+ t.Debugf("Code:")
+ // Print code on both sides of the instruction pointer.
+ start := usermem.Addr(t.Arch().IP()) - maxCodeDebugBytes/2
+ // Round start down to a 16-byte boundary.
+ start &= ^usermem.Addr(15)
+ // Print 16 bytes per line, one byte at a time.
+ for offset := uint64(0); offset < maxCodeDebugBytes; offset += 16 {
+ addr, ok := start.AddLength(offset)
+ if !ok {
+ break
+ }
+ var data [16]byte
+ n, err := m.CopyIn(t, addr, data[:], usermem.IOOpts{
+ IgnorePermissions: true,
+ })
+ // Print as much of the line as we can, even if an error was
+ // encountered.
+ if n > 0 {
+ t.Debugf("%x: % x", addr, data[:n])
+ }
+ if err != nil {
+ t.Debugf("Error reading stack at address %x: %v", addr+usermem.Addr(n), err)
+ break
+ }
+ }
+}
+
// trace definitions.
//
// Note that all region names are prefixed by ':' in order to ensure that they
@@ -203,6 +246,6 @@ func (t *Task) traceExecEvent(tc *TaskContext) {
trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>")
return
}
- defer file.DecRef()
+ defer file.DecRef(t)
trace.Logf(t.traceContext, traceCategory, "exec: %s", file.PathnameWithDeleted(t))
}
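
debugDumpCode's windowed hexdump generalizes to any byte slice. A standalone sketch of the same formatting, with dumpWindow as an illustrative name:

package main

import "fmt"

// dumpWindow mirrors debugDumpCode: round the start down to a 16-byte
// boundary and print 16 bytes per line around a point of interest.
func dumpWindow(mem []byte, ip, window int) {
	start := ip - window/2
	start &^= 15 // round down to a 16-byte boundary
	if start < 0 {
		start = 0
	}
	for off := 0; off < window; off += 16 {
		lo := start + off
		if lo >= len(mem) {
			break
		}
		hi := lo + 16
		if hi > len(mem) {
			hi = len(mem)
		}
		fmt.Printf("%x: % x\n", lo, mem[lo:hi])
	}
}

func main() {
	mem := make([]byte, 256)
	for i := range mem {
		mem[i] = byte(i)
	}
	dumpWindow(mem, 0x87, 64) // dump 64 bytes centered near "IP" 0x87
}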
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index d654dd997..8dc3fec90 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -26,6 +26,7 @@ import (
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -140,7 +141,7 @@ func (*runApp) handleCPUIDInstruction(t *Task) error {
region := trace.StartRegion(t.traceContext, cpuidRegion)
expected := arch.CPUIDInstruction[:]
found := make([]byte, len(expected))
- _, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found)
+ _, err := t.CopyInBytes(usermem.Addr(t.Arch().IP()), found)
if err == nil && bytes.Equal(expected, found) {
// Skip the cpuid instruction.
t.Arch().CPUIDEmulate(t)
@@ -167,15 +168,30 @@ func (app *runApp) execute(t *Task) taskRunState {
return (*runInterrupt)(nil)
}
- // We're about to switch to the application again. If there's still a
+ // Execute any task work callbacks before returning to user space.
+ if atomic.LoadInt32(&t.taskWorkCount) > 0 {
+ t.taskWorkMu.Lock()
+ queue := t.taskWork
+ t.taskWork = nil
+ atomic.StoreInt32(&t.taskWorkCount, 0)
+ t.taskWorkMu.Unlock()
+
+ // Do not hold taskWorkMu while executing task work, which may register
+ // more work.
+ for _, work := range queue {
+ work.TaskWork(t)
+ }
+ }
+
+ // We're about to switch to the application again. If there's still an
// unhandled SyscallRestartErrno that wasn't translated to an EINTR,
// restart the syscall that was interrupted. If there's a saved signal
// mask, restore it. (Note that restoring the saved signal mask may unblock
// a pending signal, causing another interruption, but that signal should
// not interact with the interrupted syscall.)
if t.haveSyscallReturn {
- if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
- if sre == ERESTART_RESTARTBLOCK {
+ if sre, ok := syserror.SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
+ if sre == syserror.ERESTART_RESTARTBLOCK {
t.Debugf("Restarting syscall %d with restart block after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre)
t.Arch().RestartSyscallWithRestartBlock()
} else {
@@ -245,7 +261,7 @@ func (app *runApp) execute(t *Task) taskRunState {
region := trace.StartRegion(t.traceContext, runRegion)
t.accountTaskGoroutineEnter(TaskGoroutineRunningApp)
- info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU)
+ info, at, err := t.p.Switch(t, t.MemoryManager(), t.Arch(), t.rseqCPU)
t.accountTaskGoroutineLeave(TaskGoroutineRunningApp)
region.End()
diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go
index 09366b60c..52c55d13d 100644
--- a/pkg/sentry/kernel/task_sched.go
+++ b/pkg/sentry/kernel/task_sched.go
@@ -133,9 +133,10 @@ func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) {
}
}
-// Preconditions: The caller must be running on the task goroutine, and leaving
-// a state indicated by a previous call to
-// t.accountTaskGoroutineEnter(state).
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * The caller must be leaving a state indicated by a previous call to
+// t.accountTaskGoroutineEnter(state).
func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) {
if state != TaskGoroutineRunningApp {
// Task is unblocking/continuing.
@@ -191,8 +192,8 @@ func (tg *ThreadGroup) CPUStats() usage.CPUStats {
return tg.cpuStatsAtLocked(tg.leader.k.CPUClockNow())
}
-// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt. The TaskSet mutex
-// must be locked.
+// Preconditions: Same as TaskGoroutineSchedInfo.userTicksAt, plus:
+// * The TaskSet mutex must be locked.
func (tg *ThreadGroup) cpuStatsAtLocked(now uint64) usage.CPUStats {
stats := tg.exitedCPUStats
// Account for live tasks.
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index 79766cafe..ebdb83061 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -159,7 +159,7 @@ func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunS
sigact := computeAction(linux.Signal(info.Signo), act)
if t.haveSyscallReturn {
- if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
+ if sre, ok := syserror.SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
// Signals that are ignored, cause a thread group stop, or
// terminate the thread group do not interact with interrupted
// syscalls; in Linux terms, they are never returned to the signal
@@ -168,11 +168,11 @@ func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunS
// signal that is actually handled (by userspace).
if sigact == SignalActionHandler {
switch {
- case sre == ERESTARTNOHAND:
+ case sre == syserror.ERESTARTNOHAND:
fallthrough
- case sre == ERESTART_RESTARTBLOCK:
+ case sre == syserror.ERESTART_RESTARTBLOCK:
fallthrough
- case (sre == ERESTARTSYS && !act.IsRestart()):
+ case (sre == syserror.ERESTARTSYS && !act.IsRestart()):
t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo)
t.Arch().SetReturn(uintptr(-ExtractErrno(syserror.EINTR, -1)))
default:
@@ -255,10 +255,15 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct)
}
}
+ mm := t.MemoryManager()
// Set up the signal handler. If we have a saved signal mask, the signal
// handler should run with the current mask, but sigreturn should restore
// the saved one.
- st := &arch.Stack{t.Arch(), t.MemoryManager(), sp}
+ st := &arch.Stack{
+ Arch: t.Arch(),
+ IO: mm,
+ Bottom: sp,
+ }
mask := t.signalMask
if t.haveSavedSignalMask {
mask = t.savedSignalMask
@@ -273,12 +278,13 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct)
// Please see the linux code as reference:
// linux/arch/arm64/kernel/signal.c:setup_return()
if act.Flags&linux.SA_RESTORER == 0 {
- act.Restorer = t.MemoryManager().VDSOSigReturn()
+ act.Restorer = mm.VDSOSigReturn()
}
if err := t.Arch().SignalSetup(st, &act, info, &alt, mask); err != nil {
return err
}
+ t.p.FullStateChanged()
t.haveSavedSignalMask = false
// Add our signal mask.
@@ -310,14 +316,16 @@ func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) {
// Restore our signal mask. SIGKILL and SIGSTOP should not be blocked.
t.SetSignalMask(sigset &^ UnblockableSignals)
+ t.p.FullStateChanged()
return ctrlResume, nil
}
// Sigtimedwait implements the semantics of sigtimedwait(2).
//
-// Preconditions: The caller must be running on the task goroutine. t.exitState
-// < TaskExitZombie.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t.exitState < TaskExitZombie.
func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*arch.SignalInfo, error) {
// set is the set of signals we're interested in; invert it to get the set
// of signals to block.
@@ -581,8 +589,9 @@ func (t *Task) SignalMask() linux.SignalSet {
// SetSignalMask sets t's signal mask.
//
-// Preconditions: SetSignalMask can only be called by the task goroutine.
-// t.exitState < TaskExitZombie.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t.exitState < TaskExitZombie.
func (t *Task) SetSignalMask(mask linux.SignalSet) {
// By precondition, t prevents t.tg from completing an execve and mutating
// t.tg.signalHandlers, so we can skip the TaskSet mutex.
@@ -628,7 +637,7 @@ func (t *Task) setSignalMaskLocked(mask linux.SignalSet) {
// SetSavedSignalMask sets the saved signal mask (see Task.savedSignalMask's
// comment).
//
-// Preconditions: SetSavedSignalMask can only be called by the task goroutine.
+// Preconditions: The caller must be running on the task goroutine.
func (t *Task) SetSavedSignalMask(mask linux.SignalSet) {
t.savedSignalMask = mask
t.haveSavedSignalMask = true
@@ -636,6 +645,7 @@ func (t *Task) SetSavedSignalMask(mask linux.SignalSet) {
// SignalStack returns the task-private signal stack.
func (t *Task) SignalStack() arch.SignalStack {
+ t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch())
alt := t.signalStack
if t.onSignalStack(alt) {
alt.Flags |= arch.SignalStackFlagOnStack
@@ -1050,6 +1060,8 @@ func (*runInterrupt) execute(t *Task) taskRunState {
// Are there signals pending?
if info := t.dequeueSignalLocked(t.signalMask); info != nil {
+ t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch())
+
if linux.SignalSetOf(linux.Signal(info.Signo))&StopSignals != 0 {
// Indicate that we've dequeued a stop signal before unlocking the
// signal mutex; initiateGroupStop will check for races with
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index 8485fb4b6..8e28230cc 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -16,6 +16,7 @@ package kernel
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -98,14 +99,18 @@ type TaskConfig struct {
// NewTask creates a new task defined by cfg.
//
// NewTask does not start the returned task; the caller must call Task.Start.
-func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) {
+//
+// If successful, NewTask transfers references held by cfg to the new task.
+// Otherwise, NewTask releases them.
+func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
t, err := ts.newTask(cfg)
if err != nil {
cfg.TaskContext.release()
- cfg.FSContext.DecRef()
- cfg.FDTable.DecRef()
+ cfg.FSContext.DecRef(ctx)
+ cfg.FDTable.DecRef(ctx)
+ cfg.IPCNamespace.DecRef(ctx)
if cfg.MountNamespaceVFS2 != nil {
- cfg.MountNamespaceVFS2.DecRef()
+ cfg.MountNamespaceVFS2.DecRef(ctx)
}
return nil, err
}
diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go
index 10c6e455c..a35948a5f 100644
--- a/pkg/sentry/kernel/task_stop.go
+++ b/pkg/sentry/kernel/task_stop.go
@@ -99,8 +99,9 @@ type TaskStop interface {
// beginInternalStop indicates the start of an internal stop that applies to t.
//
-// Preconditions: The task must not already be in an internal stop (i.e. t.stop
-// == nil). The caller must be running on the task goroutine.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * The task must not already be in an internal stop (i.e. t.stop == nil).
func (t *Task) beginInternalStop(s TaskStop) {
t.tg.pidns.owner.mu.RLock()
defer t.tg.pidns.owner.mu.RUnlock()
@@ -109,8 +110,8 @@ func (t *Task) beginInternalStop(s TaskStop) {
t.beginInternalStopLocked(s)
}
-// Preconditions: The signal mutex must be locked. All preconditions for
-// Task.beginInternalStop also apply.
+// Preconditions: Same as beginInternalStop, plus:
+// * The signal mutex must be locked.
func (t *Task) beginInternalStopLocked(s TaskStop) {
if t.stop != nil {
panic(fmt.Sprintf("Attempting to enter internal stop %#v when already in internal stop %#v", s, t.stop))
@@ -128,8 +129,9 @@ func (t *Task) beginInternalStopLocked(s TaskStop) {
// t.stop, which is why there is no endInternalStop that locks the signal mutex
// for you.
//
-// Preconditions: The signal mutex must be locked. The task must be in an
-// internal stop (i.e. t.stop != nil).
+// Preconditions:
+// * The signal mutex must be locked.
+// * The task must be in an internal stop (i.e. t.stop != nil).
func (t *Task) endInternalStopLocked() {
if t.stop == nil {
panic("Attempting to leave non-existent internal stop")
@@ -205,6 +207,22 @@ func (ts *TaskSet) BeginExternalStop() {
}
}
+// PullFullState pulls the full architectural state for all tasks.
+func (ts *TaskSet) PullFullState() {
+ ts.mu.Lock()
+ defer ts.mu.Unlock()
+ if ts.Root == nil {
+ return
+ }
+ for t := range ts.Root.tids {
+ t.Activate()
+ if mm := t.MemoryManager(); mm != nil {
+ t.p.PullFullState(mm.AddressSpace(), t.Arch())
+ }
+ t.Deactivate()
+ }
+}
+
// EndExternalStop indicates the end of an external stop started by a previous
// call to TaskSet.BeginExternalStop. EndExternalStop does not wait for task
// goroutines to resume.
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
index a5903b0b5..0141459e7 100644
--- a/pkg/sentry/kernel/task_syscall.go
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -22,6 +22,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/bits"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/metric"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/memmap"
@@ -29,75 +30,8 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
-// SyscallRestartErrno represents a ERESTART* errno defined in the Linux's kernel
-// include/linux/errno.h. These errnos are never returned to userspace
-// directly, but are used to communicate the expected behavior of an
-// interrupted syscall from the syscall to signal handling.
-type SyscallRestartErrno int
-
-// These numeric values are significant because ptrace syscall exit tracing can
-// observe them.
-//
-// For all of the following errnos, if the syscall is not interrupted by a
-// signal delivered to a user handler, the syscall is restarted.
-const (
- // ERESTARTSYS is returned by an interrupted syscall to indicate that it
- // should be converted to EINTR if interrupted by a signal delivered to a
- // user handler without SA_RESTART set, and restarted otherwise.
- ERESTARTSYS = SyscallRestartErrno(512)
-
- // ERESTARTNOINTR is returned by an interrupted syscall to indicate that it
- // should always be restarted.
- ERESTARTNOINTR = SyscallRestartErrno(513)
-
- // ERESTARTNOHAND is returned by an interrupted syscall to indicate that it
- // should be converted to EINTR if interrupted by a signal delivered to a
- // user handler, and restarted otherwise.
- ERESTARTNOHAND = SyscallRestartErrno(514)
-
- // ERESTART_RESTARTBLOCK is returned by an interrupted syscall to indicate
- // that it should be restarted using a custom function. The interrupted
- // syscall must register a custom restart function by calling
- // Task.SetRestartSyscallFn.
- ERESTART_RESTARTBLOCK = SyscallRestartErrno(516)
-)
-
var vsyscallCount = metric.MustCreateNewUint64Metric("/kernel/vsyscall_count", false /* sync */, "Number of times vsyscalls were invoked by the application")
-// Error implements error.Error.
-func (e SyscallRestartErrno) Error() string {
- // Descriptions are borrowed from strace.
- switch e {
- case ERESTARTSYS:
- return "to be restarted if SA_RESTART is set"
- case ERESTARTNOINTR:
- return "to be restarted"
- case ERESTARTNOHAND:
- return "to be restarted if no handler"
- case ERESTART_RESTARTBLOCK:
- return "interrupted by signal"
- default:
- return "(unknown interrupt error)"
- }
-}
-
-// SyscallRestartErrnoFromReturn returns the SyscallRestartErrno represented by
-// rv, the value in a syscall return register.
-func SyscallRestartErrnoFromReturn(rv uintptr) (SyscallRestartErrno, bool) {
- switch int(rv) {
- case -int(ERESTARTSYS):
- return ERESTARTSYS, true
- case -int(ERESTARTNOINTR):
- return ERESTARTNOINTR, true
- case -int(ERESTARTNOHAND):
- return ERESTARTNOHAND, true
- case -int(ERESTART_RESTARTBLOCK):
- return ERESTART_RESTARTBLOCK, true
- default:
- return 0, false
- }
-}
-
// SyscallRestartBlock represents the restart block for a syscall restartable
// with a custom function. It encapsulates the state required to restart a
// syscall across a S/R.
@@ -354,7 +288,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState {
// Grab the caller up front, to make sure there's a sensible stack.
caller := t.Arch().Native(uintptr(0))
- if _, err := t.CopyIn(usermem.Addr(t.Arch().Stack()), caller); err != nil {
+ if _, err := caller.CopyIn(t, usermem.Addr(t.Arch().Stack())); err != nil {
t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err)
t.forceSignal(linux.SIGSEGV, false /* unconditional */)
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
@@ -390,7 +324,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState {
type runVsyscallAfterPtraceEventSeccomp struct {
addr usermem.Addr
sysno uintptr
- caller interface{}
+ caller marshal.Marshallable
}
func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
@@ -413,7 +347,7 @@ func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller)
}
-func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller interface{}) taskRunState {
+func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller marshal.Marshallable) taskRunState {
rval, ctrl, err := t.executeSyscall(sysno, args)
if ctrl != nil {
t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl)
@@ -447,7 +381,7 @@ func ExtractErrno(err error, sysno int) int {
return 0
case syscall.Errno:
return int(err)
- case SyscallRestartErrno:
+ case syserror.SyscallRestartErrno:
return int(err)
case *memmap.BusError:
// Bus errors may generate SIGBUS, but for syscalls they still
diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go
index b02044ad2..ce134bf54 100644
--- a/pkg/sentry/kernel/task_usermem.go
+++ b/pkg/sentry/kernel/task_usermem.go
@@ -18,6 +18,7 @@ import (
"math"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -43,17 +44,6 @@ func (t *Task) Deactivate() {
}
}
-// CopyIn copies a fixed-size value or slice of fixed-size values in from the
-// task's memory. The copy will fail with syscall.EFAULT if it traverses user
-// memory that is unmapped or not readable by the user.
-//
-// This Task's AddressSpace must be active.
-func (t *Task) CopyIn(addr usermem.Addr, dst interface{}) (int, error) {
- return usermem.CopyObjectIn(t, t.MemoryManager(), addr, dst, usermem.IOOpts{
- AddressSpaceActive: true,
- })
-}
-
// CopyInBytes is a fast version of CopyIn if the caller can serialize the
// data without reflection and pass in a byte slice.
//
@@ -64,17 +54,6 @@ func (t *Task) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
})
}
-// CopyOut copies a fixed-size value or slice of fixed-size values out to the
-// task's memory. The copy will fail with syscall.EFAULT if it traverses user
-// memory that is unmapped or not writeable by the user.
-//
-// This Task's AddressSpace must be active.
-func (t *Task) CopyOut(addr usermem.Addr, src interface{}) (int, error) {
- return usermem.CopyObjectOut(t, t.MemoryManager(), addr, src, usermem.IOOpts{
- AddressSpaceActive: true,
- })
-}
-
// CopyOutBytes is a fast version of CopyOut if the caller can serialize the
// data without reflection and pass in a byte slice.
//
@@ -114,7 +93,7 @@ func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([
var v []string
for {
argAddr := t.Arch().Native(0)
- if _, err := t.CopyIn(addr, argAddr); err != nil {
+ if _, err := argAddr.CopyIn(t, addr); err != nil {
return v, err
}
if t.Arch().Value(argAddr) == 0 {
@@ -143,8 +122,9 @@ func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([
// CopyOutIovecs converts src to an array of struct iovecs and copies it to the
// memory mapped at addr.
//
-// Preconditions: As for usermem.IO.CopyOut. The caller must be running on the
-// task goroutine. t's AddressSpace must be active.
+// Preconditions: Same as usermem.IO.CopyOut, plus:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error {
switch t.Arch().Width() {
case 8:
@@ -191,8 +171,9 @@ func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error
// combined length of all AddrRanges would otherwise exceed this amount, ranges
// beyond MAX_RW_COUNT are silently truncated.
//
-// Preconditions: As for usermem.IO.CopyIn. The caller must be running on the
-// task goroutine. t's AddressSpace must be active.
+// Preconditions: Same as usermem.IO.CopyIn, plus:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRangeSeq, error) {
if numIovecs == 0 {
return usermem.AddrRangeSeq{}, nil
@@ -284,7 +265,7 @@ func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOp
//
// IovecsIOSequence is analogous to Linux's lib/iov_iter.c:import_iovec().
//
-// Preconditions: As for Task.CopyInIovecs.
+// Preconditions: Same as Task.CopyInIovecs.
func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) {
if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV {
return usermem.IOSequence{}, syserror.EINVAL
@@ -299,3 +280,30 @@ func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOp
Opts: opts,
}, nil
}
+
+// copyContext implements marshal.CopyContext. It wraps a task to allow copying
+// memory to and from the task memory with custom usermem.IOOpts.
+type copyContext struct {
+ *Task
+ opts usermem.IOOpts
+}
+
+// AsCopyContext wraps the task and returns it as CopyContext.
+func (t *Task) AsCopyContext(opts usermem.IOOpts) marshal.CopyContext {
+ return &copyContext{t, opts}
+}
+
+// CopyInString copies a string in from the task's memory.
+func (t *copyContext) CopyInString(addr usermem.Addr, maxLen int) (string, error) {
+ return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxLen, t.opts)
+}
+
+// CopyInBytes copies task memory into dst from an IO context.
+func (t *copyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
+ return t.MemoryManager().CopyIn(t, addr, dst, t.opts)
+}
+
+// CopyOutBytes copies src into the task's memory from an IO context.
+func (t *copyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) {
+ return t.MemoryManager().CopyOut(t, addr, src, t.opts)
+}
diff --git a/pkg/sentry/kernel/task_work.go b/pkg/sentry/kernel/task_work.go
new file mode 100644
index 000000000..dda5a433a
--- /dev/null
+++ b/pkg/sentry/kernel/task_work.go
@@ -0,0 +1,38 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import "sync/atomic"
+
+// TaskWorker is a deferred task.
+//
+// This must be savable.
+type TaskWorker interface {
+ // TaskWork will be executed prior to returning to user space. Note that
+ // TaskWork may call RegisterWork again, but this will not be executed until
+ // the next return to user space, unlike in Linux. This effectively allows
+ // registration of indefinite user return hooks, but not by default.
+ TaskWork(t *Task)
+}
+
+// RegisterWork can be used to register additional task work that will be
+// performed prior to returning to user space. See TaskWorker.TaskWork for
+// semantics regarding registration.
+func (t *Task) RegisterWork(work TaskWorker) {
+ t.taskWorkMu.Lock()
+ defer t.taskWorkMu.Unlock()
+ atomic.AddInt32(&t.taskWorkCount, 1)
+ t.taskWork = append(t.taskWork, work)
+}
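
The TaskWorker queue above pairs with the drain loop added to runApp.execute. A standalone miniature of both halves, with purely illustrative names:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// worker mirrors TaskWorker: deferred work run before returning to user space.
type worker interface {
	taskWork()
}

type task struct {
	workCount int32 // checked atomically so the empty case skips the mutex
	workMu    sync.Mutex
	work      []worker
}

// registerWork mirrors Task.RegisterWork above.
func (t *task) registerWork(w worker) {
	t.workMu.Lock()
	defer t.workMu.Unlock()
	atomic.AddInt32(&t.workCount, 1)
	t.work = append(t.work, w)
}

// runWork mirrors the drain loop in runApp.execute: swap the queue out under
// the lock, then run callbacks without holding it so they may register more
// work for the next return to user space.
func (t *task) runWork() {
	if atomic.LoadInt32(&t.workCount) == 0 {
		return
	}
	t.workMu.Lock()
	queue := t.work
	t.work = nil
	atomic.StoreInt32(&t.workCount, 0)
	t.workMu.Unlock()

	for _, w := range queue {
		w.taskWork()
	}
}

type printWork string

func (p printWork) taskWork() { fmt.Println("task work:", string(p)) }

func main() {
	t := &task{}
	t.registerWork(printWork("flush coverage data"))
	t.runWork()
}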
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 52849f5b3..a183b28c1 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -18,6 +18,7 @@ import (
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -307,8 +308,8 @@ func (tg *ThreadGroup) Limits() *limits.LimitSet {
return tg.limits
}
-// release releases the thread group's resources.
-func (tg *ThreadGroup) release() {
+// Release releases the thread group's resources.
+func (tg *ThreadGroup) Release(ctx context.Context) {
// Timers must be destroyed without holding the TaskSet or signal mutexes
// since timers send signals with Timer.mu locked.
tg.itimerRealTimer.Destroy()
@@ -325,7 +326,7 @@ func (tg *ThreadGroup) release() {
it.DestroyTimer()
}
if tg.mounts != nil {
- tg.mounts.DecRef()
+ tg.mounts.DecRef(ctx)
}
}
@@ -366,7 +367,8 @@ func (tg *ThreadGroup) SetControllingTTY(tty *TTY, arg int32) error {
// terminal is stolen, and all processes that had it as controlling
// terminal lose it." - tty_ioctl(4)
if tty.tg != nil && tg.processGroup.session != tty.tg.processGroup.session {
- if !auth.CredentialsFromContext(tg.leader).HasCapability(linux.CAP_SYS_ADMIN) || arg != 1 {
+ // Stealing requires CAP_SYS_ADMIN in the root user namespace.
+ if creds := auth.CredentialsFromContext(tg.leader); !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) || arg != 1 {
return syserror.EPERM
}
// Steal the TTY away. Unlike TIOCNOTTY, don't send signals.
diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go
index bf2dabb6e..fdadb52c0 100644
--- a/pkg/sentry/kernel/threads.go
+++ b/pkg/sentry/kernel/threads.go
@@ -36,6 +36,8 @@ import (
const TasksLimit = (1 << 16)
// ThreadID is a generic thread identifier.
+//
+// +marshal
type ThreadID int32
// String returns a decimal representation of the ThreadID.
@@ -87,6 +89,13 @@ type TaskSet struct {
// at time of save (but note that this is not necessarily the same thing as
// sync.WaitGroup's zero value).
runningGoroutines sync.WaitGroup `state:"nosave"`
+
+ // aioGoroutines is the number of goroutines running async I/O
+ // callbacks.
+ //
+ // aioGoroutines is not saved but is required to be zero at the time of
+ // save.
+ aioGoroutines sync.WaitGroup `state:"nosave"`
}
// newTaskSet returns a new, empty TaskSet.
@@ -256,6 +265,13 @@ func (ns *PIDNamespace) Tasks() []*Task {
return tasks
}
+// NumTasks returns the number of tasks in ns.
+func (ns *PIDNamespace) NumTasks() int {
+ ns.owner.mu.RLock()
+ defer ns.owner.mu.RUnlock()
+ return len(ns.tids)
+}
+
// ThreadGroups returns a snapshot of the thread groups in ns.
func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup {
return ns.ThreadGroupsAppend(nil)
diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD
index 7ba7dc50c..2817aa3ba 100644
--- a/pkg/sentry/kernel/time/BUILD
+++ b/pkg/sentry/kernel/time/BUILD
@@ -6,6 +6,7 @@ go_library(
name = "time",
srcs = [
"context.go",
+ "tcpip.go",
"time.go",
],
visibility = ["//pkg/sentry:internal"],
diff --git a/pkg/sentry/kernel/time/tcpip.go b/pkg/sentry/kernel/time/tcpip.go
new file mode 100644
index 000000000..c4474c0cf
--- /dev/null
+++ b/pkg/sentry/kernel/time/tcpip.go
@@ -0,0 +1,131 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package time
+
+import (
+ "sync"
+ "time"
+)
+
+// TcpipAfterFunc waits for duration to elapse according to clock, then runs fn.
+// The timer is started immediately and will fire exactly once.
+func TcpipAfterFunc(clock Clock, duration time.Duration, fn func()) *TcpipTimer {
+ timer := &TcpipTimer{
+ clock: clock,
+ }
+ timer.notifier = functionNotifier{
+ fn: func() {
+ // tcpip.Timer.Stop() explicitly states that the function is called in a
+ // separate goroutine that Stop() does not synchronize with.
+ // Timer.Destroy() synchronizes with calls to TimerListener.Notify().
+ // This is semantically meaningful because, in the former case, it's
+ // legal to call tcpip.Timer.Stop() while holding locks that may also be
+ // taken by the function, but this isn't so in the latter case. Most
+ // immediately, Timer calls TimerListener.Notify() while holding
+ // Timer.mu. A deadlock occurs without spawning a goroutine:
+ // T1: (Timer expires)
+ // => Timer.Tick() <- Timer.mu.Lock() called
+ // => TimerListener.Notify()
+ // => Timer.Stop()
+ // => Timer.Destroy() <- Timer.mu.Lock() called, deadlock!
+ //
+ // Spawning a goroutine avoids the deadlock:
+ // T1: (Timer expires)
+ // => Timer.Tick() <- Timer.mu.Lock() called
+ // => TimerListener.Notify() <- Launches T2
+ // T2:
+ // => Timer.Stop()
+ // => Timer.Destroy() <- Timer.mu.Lock() called, blocks
+ // T1:
+ // => (returns) <- Timer.mu.Unlock() called
+ // T2:
+ // => (continues) <- No deadlock!
+ go func() {
+ timer.Stop()
+ fn()
+ }()
+ },
+ }
+ timer.Reset(duration)
+ return timer
+}
+
+// TcpipTimer is a resettable timer with variable-duration expirations. It
+// implements tcpip.Timer, which does not define a Destroy method; instead,
+// all resources are released when the timer expires or Stop is called.
+//
+// Must be created by TcpipAfterFunc.
+type TcpipTimer struct {
+ // clock is the time source. clock is immutable.
+ clock Clock
+
+ // notifier is called when the Timer expires. notifier is immutable.
+ notifier functionNotifier
+
+ // mu protects t.
+ mu sync.Mutex
+
+ // t stores the latest running Timer. This is replaced whenever Reset is
+ // called since Timer cannot be restarted once it has been Destroyed by Stop.
+ //
+ // This field is nil iff Stop has been called.
+ t *Timer
+}
+
+// Stop implements tcpip.Timer.Stop.
+func (r *TcpipTimer) Stop() bool {
+ r.mu.Lock()
+ defer r.mu.Unlock()
+
+ if r.t == nil {
+ return false
+ }
+ _, lastSetting := r.t.Swap(Setting{})
+ r.t.Destroy()
+ r.t = nil
+ return lastSetting.Enabled
+}
+
+// Reset implements tcpip.Timer.Reset.
+func (r *TcpipTimer) Reset(d time.Duration) {
+ r.mu.Lock()
+ defer r.mu.Unlock()
+
+ if r.t == nil {
+ r.t = NewTimer(r.clock, &r.notifier)
+ }
+
+ r.t.Swap(Setting{
+ Enabled: true,
+ Period: 0,
+ Next: r.clock.Now().Add(d),
+ })
+}
+
+// functionNotifier is a TimerListener that runs a function.
+//
+// functionNotifier cannot be saved or loaded.
+type functionNotifier struct {
+ fn func()
+}
+
+// Notify implements ktime.TimerListener.Notify.
+func (f *functionNotifier) Notify(uint64, Setting) (Setting, bool) {
+ f.fn()
+ return Setting{}, false
+}
+
+// Destroy implements ktime.TimerListener.Destroy.
+func (f *functionNotifier) Destroy() {}
diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go
index e959700f2..f61a8e164 100644
--- a/pkg/sentry/kernel/time/time.go
+++ b/pkg/sentry/kernel/time/time.go
@@ -616,8 +616,10 @@ func (t *Timer) Swap(s Setting) (Time, Setting) {
// Timer's Clock) at which the Setting was changed. Setting s.Enabled to true
// starts the timer, while setting s.Enabled to false stops it.
//
-// Preconditions: The Timer must not be paused. f cannot call any Timer methods
-// since it is called with the Timer mutex locked.
+// Preconditions:
+// * The Timer must not be paused.
+// * f cannot call any Timer methods since it is called with the Timer mutex
+// locked.
func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) {
now := t.clock.Now()
t.mu.Lock()
diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go
index da0ea7bb5..7c4fefb16 100644
--- a/pkg/sentry/kernel/timekeeper.go
+++ b/pkg/sentry/kernel/timekeeper.go
@@ -21,8 +21,8 @@ import (
"gvisor.dev/gvisor/pkg/log"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
- "gvisor.dev/gvisor/pkg/sentry/platform"
sentrytime "gvisor.dev/gvisor/pkg/sentry/time"
"gvisor.dev/gvisor/pkg/sync"
)
@@ -90,7 +90,7 @@ type Timekeeper struct {
// NewTimekeeper does not take ownership of paramPage.
//
// SetClocks must be called on the returned Timekeeper before it is usable.
-func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage platform.FileRange) (*Timekeeper, error) {
+func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage memmap.FileRange) (*Timekeeper, error) {
return &Timekeeper{
params: NewVDSOParamPage(mfp, paramPage),
}, nil
@@ -186,6 +186,7 @@ func (t *Timekeeper) startUpdater() {
timer := time.NewTicker(sentrytime.ApproxUpdateInterval)
t.wg.Add(1)
go func() { // S/R-SAFE: stopped during save.
+ defer t.wg.Done()
for {
// Start with an update immediately, so the clocks are
// ready ASAP.
@@ -209,9 +210,6 @@ func (t *Timekeeper) startUpdater() {
p.realtimeBaseRef = int64(realtimeParams.BaseRef)
p.realtimeFrequency = realtimeParams.Frequency
}
-
- log.Debugf("Updating VDSO parameters: %+v", p)
-
return p
}); err != nil {
log.Warningf("Unable to update VDSO parameter page: %v", err)
@@ -220,7 +218,6 @@ func (t *Timekeeper) startUpdater() {
select {
case <-timer.C:
case <-t.stop:
- t.wg.Done()
return
}
}
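The updater fix is the standard defer pattern: moving wg.Done into a defer makes the decrement unconditional on every exit path instead of only on the stop branch. A generic sketch of the pattern:

	stop := make(chan struct{})
	ticker := time.NewTicker(time.Second)
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done() // runs on every return path, including panics
		defer ticker.Stop()
		for {
			select {
			case <-ticker.C:
				// periodic work
			case <-stop:
				return
			}
		}
	}()
	close(stop)
	wg.Wait()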
diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go
index f1b3c212c..9bc452e67 100644
--- a/pkg/sentry/kernel/vdso.go
+++ b/pkg/sentry/kernel/vdso.go
@@ -17,10 +17,9 @@ package kernel
import (
"fmt"
- "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -28,6 +27,8 @@ import (
//
// They are exposed to the VDSO via a parameter page managed by VDSOParamPage,
// which also includes a sequence counter.
+//
+// +marshal
type vdsoParams struct {
monotonicReady uint64
monotonicBaseCycles int64
@@ -58,7 +59,7 @@ type vdsoParams struct {
type VDSOParamPage struct {
// The parameter page is fr, allocated from mfp.MemoryFile().
mfp pgalloc.MemoryFileProvider
- fr platform.FileRange
+ fr memmap.FileRange
// seq is the current sequence count written to the page.
//
@@ -68,21 +69,29 @@ type VDSOParamPage struct {
// checked in state_test_util tests, causing this field to change across
// save / restore.
seq uint64
+
+ // copyScratchBuffer is a temporary buffer used to marshal the params before
+ // copying it to the real parameter page. The parameter page is typically
+ // updated at a moderate frequency of ~O(seconds) throughout the lifetime of
+ // the sentry, so reusing this buffer is a good tradeoff between memory
+ // usage and the cost of allocation.
+ copyScratchBuffer []byte
}
// NewVDSOParamPage returns a VDSOParamPage.
//
// Preconditions:
-//
// * fr is a single page allocated from mfp.MemoryFile(). VDSOParamPage does
// not take ownership of fr; it must remain allocated for the lifetime of the
// VDSOParamPage.
-//
// * VDSOParamPage must be the only writer to fr.
-//
// * mfp.MemoryFile().MapInternal(fr) must return a single safemem.Block.
-func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *VDSOParamPage {
- return &VDSOParamPage{mfp: mfp, fr: fr}
+func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *VDSOParamPage {
+ return &VDSOParamPage{
+ mfp: mfp,
+ fr: fr,
+ copyScratchBuffer: make([]byte, (*vdsoParams)(nil).SizeBytes()),
+ }
}
// access returns a mapping of the param page.
@@ -136,7 +145,8 @@ func (v *VDSOParamPage) Write(f func() vdsoParams) error {
// Get the new params.
p := f()
- buf := binary.Marshal(nil, usermem.ByteOrder, p)
+ buf := v.copyScratchBuffer[:p.SizeBytes()]
+ p.MarshalUnsafe(buf)
// Skip the sequence counter.
if _, err := safemem.Copy(paramPage.DropFirst(8), safemem.BlockFromSafeSlice(buf)); err != nil {
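The scratch-buffer pattern generalizes to any go_marshal type with a fixed size: size the buffer once at construction, then reslice it per update. A sketch with a hypothetical Params type:

	type paramsWriter struct {
		// scratch is allocated once, at construction time, as
		// make([]byte, (*Params)(nil).SizeBytes()).
		scratch []byte
	}

	func (w *paramsWriter) encode(p *Params) []byte {
		buf := w.scratch[:p.SizeBytes()]
		p.MarshalUnsafe(buf) // no per-update allocation, unlike binary.Marshal
		return buf
	}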
diff --git a/pkg/sentry/limits/context.go b/pkg/sentry/limits/context.go
index 77e1fe217..0bade6e57 100644
--- a/pkg/sentry/limits/context.go
+++ b/pkg/sentry/limits/context.go
@@ -33,3 +33,12 @@ func FromContext(ctx context.Context) *LimitSet {
}
return nil
}
+
+// FromContextOrDie returns FromContext(ctx) if the latter is not nil;
+// otherwise it panics.
+func FromContextOrDie(ctx context.Context) *LimitSet {
+ if v := ctx.Value(CtxLimits); v != nil {
+ return v.(*LimitSet)
+ }
+ panic("no limit set found in context")
+}
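A hypothetical call site, for contexts (such as task contexts) that are guaranteed to carry a LimitSet:

	ls := limits.FromContextOrDie(ctx)
	nofile := ls.Get(limits.NumberOfFiles).Cur // soft RLIMIT_NOFILE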
diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD
index c6aa65f28..34bdb0b69 100644
--- a/pkg/sentry/loader/BUILD
+++ b/pkg/sentry/loader/BUILD
@@ -30,9 +30,6 @@ go_library(
"//pkg/rand",
"//pkg/safemem",
"//pkg/sentry/arch",
- "//pkg/sentry/fs",
- "//pkg/sentry/fs/anon",
- "//pkg/sentry/fs/fsutil",
"//pkg/sentry/fsbridge",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/limits",
@@ -45,6 +42,5 @@ go_library(
"//pkg/syserr",
"//pkg/syserror",
"//pkg/usermem",
- "//pkg/waiter",
],
)
diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index 616fafa2c..98af2cc38 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -90,14 +90,23 @@ type elfInfo struct {
sharedObject bool
}
+// fullReader interface extracts the ReadFull method from fsbridge.File so
+// that client code does not need to implement an entire fsbridge.File when
+// only read functionality is needed.
+//
+// TODO(gvisor.dev/issue/1035): Once VFS2 ships, rewrite this to wrap
+// vfs.FileDescription's PRead/Read instead.
+type fullReader interface {
+ // ReadFull is the same as fsbridge.File.ReadFull.
+ ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error)
+}
+
// parseHeader parses the ELF header, verifying that this is a supported ELF
// file and returning the ELF program headers.
//
// This is similar to elf.NewFile, except that it is more strict about what it
// accepts from the ELF, and it doesn't parse unnecessary parts of the file.
-//
-// ctx may be nil if f does not need it.
-func parseHeader(ctx context.Context, f fsbridge.File) (elfInfo, error) {
+func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) {
// Check ident first; it will tell us the endianness of the rest of the
// structs.
var ident [elf.EI_NIDENT]byte
@@ -185,6 +194,10 @@ func parseHeader(ctx context.Context, f fsbridge.File) (elfInfo, error) {
log.Infof("Too many phdrs (%d): total size %d > %d", hdr.Phnum, totalPhdrSize, maxTotalPhdrSize)
return elfInfo{}, syserror.ENOEXEC
}
+ if int64(hdr.Phoff) < 0 || int64(hdr.Phoff+uint64(totalPhdrSize)) < 0 {
+ ctx.Infof("Unsupported phdr offset %d", hdr.Phoff)
+ return elfInfo{}, syserror.ENOEXEC
+ }
phdrBuf := make([]byte, totalPhdrSize)
_, err = f.ReadFull(ctx, usermem.BytesIOSequence(phdrBuf), int64(hdr.Phoff))
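Both new checks guard the same hazard: ELF offsets arrive as uint64 but are consumed as int64 by ReadFull, so any value with the top bit set, before or after adding the read length, would turn into a negative offset. The guard in isolation (a sketch, not a helper this change introduces):

	func elfOffsetOverflows(off, length uint64) bool {
		// True if off or off+length does not fit in a non-negative int64.
		// Example: off = 1<<63 trips the first test; off = (1<<63)-8 with
		// length = 16 exceeds the int64 range and trips the second.
		return int64(off) < 0 || int64(off+length) < 0
	}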
@@ -272,7 +285,7 @@ func mapSegment(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, phdr
}
defer func() {
if mopts.MappingIdentity != nil {
- mopts.MappingIdentity.DecRef()
+ mopts.MappingIdentity.DecRef(ctx)
}
}()
if err := f.ConfigureMMap(ctx, &mopts); err != nil {
@@ -393,8 +406,7 @@ type loadedELF struct {
//
// It does not load the ELF interpreter, or return any auxv entries.
//
-// Preconditions:
-// * f is an ELF file
+// Preconditions: f is an ELF file.
func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, info elfInfo, sharedLoadOffset usermem.Addr) (loadedELF, error) {
first := true
var start, end usermem.Addr
@@ -429,6 +441,10 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, in
ctx.Infof("PT_INTERP path too big: %v", phdr.Filesz)
return loadedELF{}, syserror.ENOEXEC
}
+ if int64(phdr.Off) < 0 || int64(phdr.Off+phdr.Filesz) < 0 {
+ ctx.Infof("Unsupported PT_INTERP offset %d", phdr.Off)
+ return loadedELF{}, syserror.ENOEXEC
+ }
path := make([]byte, phdr.Filesz)
_, err := f.ReadFull(ctx, usermem.BytesIOSequence(path), int64(phdr.Off))
@@ -562,8 +578,8 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, in
// It does not load the ELF interpreter, or return any auxv entries.
//
// Preconditions:
-// * f is an ELF file
-// * f is the first ELF loaded into m
+// * f is an ELF file.
+// * f is the first ELF loaded into m.
func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureSet, f fsbridge.File) (loadedELF, arch.Context, error) {
info, err := parseHeader(ctx, f)
if err != nil {
@@ -600,8 +616,7 @@ func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureS
//
// It does not return any auxv entries.
//
-// Preconditions:
-// * f is an ELF file
+// Preconditions: f is an ELF file.
func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, initial loadedELF) (loadedELF, error) {
info, err := parseHeader(ctx, f)
if err != nil {
@@ -631,8 +646,7 @@ func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.Fil
// If loadELF returns ErrSwitchFile it should be called again with the returned
// path and argv.
//
-// Preconditions:
-// * args.File is an ELF file
+// Preconditions: args.File is an ELF file.
func loadELF(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, error) {
bin, ac, err := loadInitialELF(ctx, args.MemoryManager, args.Features, args.File)
if err != nil {
@@ -654,7 +668,7 @@ func loadELF(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, error
ctx.Infof("Error opening interpreter %s: %v", bin.interpreter, err)
return loadedELF{}, nil, err
}
- defer intFile.DecRef()
+ defer intFile.DecRef(ctx)
interp, err = loadInterpreterELF(ctx, args.MemoryManager, intFile, bin)
if err != nil {
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index 88449fe95..c69b62db9 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -27,7 +27,6 @@ import (
"gvisor.dev/gvisor/pkg/cpuid"
"gvisor.dev/gvisor/pkg/rand"
"gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fsbridge"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/mm"
@@ -80,22 +79,6 @@ type LoadArgs struct {
Features *cpuid.FeatureSet
}
-// readFull behaves like io.ReadFull for an *fs.File.
-func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
- var total int64
- for dst.NumBytes() > 0 {
- n, err := f.Preadv(ctx, dst, offset+total)
- total += n
- if err == io.EOF && total != 0 {
- return total, io.ErrUnexpectedEOF
- } else if err != nil {
- return total, err
- }
- dst = dst.DropFirst64(n)
- }
- return total, nil
-}
-
// openPath opens args.Filename and checks that it is valid for loading.
//
// openPath returns an *fs.Dirent and *fs.File for args.Filename, which is not
@@ -139,7 +122,7 @@ func allocStack(ctx context.Context, m *mm.MemoryManager, a arch.Context) (*arch
if err != nil {
return nil, err
}
- return &arch.Stack{a, m, ar.End}, nil
+ return &arch.Stack{Arch: a, IO: m, Bottom: ar.End}, nil
}
const (
@@ -171,7 +154,7 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context
return loadedELF{}, nil, nil, nil, err
}
// Ensure the file is released in case the code loops or errors out.
- defer args.File.DecRef()
+ defer args.File.DecRef(ctx)
} else {
if err := checkIsRegularFile(ctx, args.File, args.Filename); err != nil {
return loadedELF{}, nil, nil, nil, err
@@ -232,20 +215,20 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context
// path and argv.
//
// Preconditions:
-// * The Task MemoryManager is empty.
-// * Load is called on the Task goroutine.
+// * The Task MemoryManager is empty.
+// * Load is called on the Task goroutine.
func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) {
// Load the executable itself.
loaded, ac, file, newArgv, err := loadExecutable(ctx, args)
if err != nil {
- return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux())
+ return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux())
}
- defer file.DecRef()
+ defer file.DecRef(ctx)
// Load the VDSO.
vdsoAddr, err := loadVDSO(ctx, args.MemoryManager, vdso, loaded)
if err != nil {
- return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Error loading VDSO: %v", err), syserr.FromError(err).ToLinux())
+ return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("error loading VDSO: %v", err), syserr.FromError(err).ToLinux())
}
// Setup the heap. brk starts at the next page after the end of the
@@ -264,20 +247,20 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V
}
// Push the original filename to the stack, for AT_EXECFN.
- execfn, err := stack.Push(args.Filename)
- if err != nil {
+ if _, err := stack.PushNullTerminatedByteSlice([]byte(args.Filename)); err != nil {
return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to push exec filename: %v", err), syserr.FromError(err).ToLinux())
}
+ execfn := stack.Bottom
// Push 16 random bytes on the stack which AT_RANDOM will point to.
var b [16]byte
if _, err := rand.Read(b[:]); err != nil {
return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to read random bytes: %v", err), syserr.FromError(err).ToLinux())
}
- random, err := stack.Push(b)
- if err != nil {
+ if _, err = stack.PushNullTerminatedByteSlice(b[:]); err != nil {
return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to push random bytes: %v", err), syserr.FromError(err).ToLinux())
}
+ random := stack.Bottom
c := auth.CredentialsFromContext(ctx)
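PushNullTerminatedByteSlice appends a terminating NUL and leaves the stack pointer at the start of the pushed bytes, so the address recorded for the aux vector is simply stack.Bottom afterwards. A sketch, assuming stack is an arch.Stack and a hypothetical path:

	if _, err := stack.PushNullTerminatedByteSlice([]byte("/bin/app")); err != nil {
		return err
	}
	execfn := stack.Bottom // address of the NUL-terminated string just pushed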
@@ -309,7 +292,7 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V
m.SetEnvvStart(sl.EnvvStart)
m.SetEnvvEnd(sl.EnvvEnd)
m.SetAuxv(auxv)
- m.SetExecutable(file)
+ m.SetExecutable(ctx, file)
symbolValue, err := getSymbolValueFromVDSO("rt_sigreturn")
if err != nil {
diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go
index 00977fc08..241d87835 100644
--- a/pkg/sentry/loader/vdso.go
+++ b/pkg/sentry/loader/vdso.go
@@ -26,10 +26,6 @@ import (
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/fs"
- "gvisor.dev/gvisor/pkg/sentry/fs/anon"
- "gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
- "gvisor.dev/gvisor/pkg/sentry/fsbridge"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/mm"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
@@ -37,7 +33,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
- "gvisor.dev/gvisor/pkg/waiter"
)
const vdsoPrelink = 0xffffffffff700000
@@ -55,52 +50,11 @@ func (f *fileContext) Value(key interface{}) interface{} {
}
}
-// byteReader implements fs.FileOperations for reading from a []byte source.
-type byteReader struct {
- fsutil.FileNoFsync `state:"nosave"`
- fsutil.FileNoIoctl `state:"nosave"`
- fsutil.FileNoMMap `state:"nosave"`
- fsutil.FileNoSplice `state:"nosave"`
- fsutil.FileNoopFlush `state:"nosave"`
- fsutil.FileNoopRelease `state:"nosave"`
- fsutil.FileNotDirReaddir `state:"nosave"`
- fsutil.FilePipeSeek `state:"nosave"`
- fsutil.FileUseInodeUnstableAttr `state:"nosave"`
- waiter.AlwaysReady `state:"nosave"`
-
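+// byteFullReader implements the fullReader interface over an in-memory byte
+// slice; the loader uses it to read the embedded VDSO image.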
+type byteFullReader struct {
data []byte
}
-var _ fs.FileOperations = (*byteReader)(nil)
-
-// newByteReaderFile creates a fake file to read data from.
-//
-// TODO(gvisor.dev/issue/1623): Convert to VFS2.
-func newByteReaderFile(ctx context.Context, data []byte) *fs.File {
- // Create a fake inode.
- inode := fs.NewInode(
- ctx,
- &fsutil.SimpleFileInode{},
- fs.NewPseudoMountSource(ctx),
- fs.StableAttr{
- Type: fs.Anonymous,
- DeviceID: anon.PseudoDevice.DeviceID(),
- InodeID: anon.PseudoDevice.NextIno(),
- BlockSize: usermem.PageSize,
- })
-
- // Use the fake inode to create a fake dirent.
- dirent := fs.NewTransientDirent(inode)
- defer dirent.DecRef()
-
- // Use the fake dirent to make a fake file.
- flags := fs.FileFlags{Read: true, Pread: true}
- return fs.NewFile(&fileContext{Context: context.Background()}, dirent, flags, &byteReader{
- data: data,
- })
-}
-
-func (b *byteReader) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+func (b *byteFullReader) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) {
if offset < 0 {
return 0, syserror.EINVAL
}
@@ -111,10 +65,6 @@ func (b *byteReader) Read(ctx context.Context, file *fs.File, dst usermem.IOSequ
return int64(n), err
}
-func (b *byteReader) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
- panic("Write not supported")
-}
-
// validateVDSO checks that the VDSO can be loaded by loadVDSO.
//
// VDSOs are special (see below). Since we are going to map the VDSO directly
@@ -130,7 +80,7 @@ func (b *byteReader) Write(ctx context.Context, file *fs.File, src usermem.IOSeq
// * PT_LOAD segments don't extend beyond the end of the file.
//
// ctx may be nil if f does not need it.
-func validateVDSO(ctx context.Context, f fsbridge.File, size uint64) (elfInfo, error) {
+func validateVDSO(ctx context.Context, f fullReader, size uint64) (elfInfo, error) {
info, err := parseHeader(ctx, f)
if err != nil {
log.Infof("Unable to parse VDSO header: %v", err)
@@ -248,13 +198,12 @@ func getSymbolValueFromVDSO(symbol string) (uint64, error) {
// PrepareVDSO validates the system VDSO and returns a VDSO, containing the
// param page for updating by the kernel.
-func PrepareVDSO(ctx context.Context, mfp pgalloc.MemoryFileProvider) (*VDSO, error) {
- vdsoFile := fsbridge.NewFSFile(newByteReaderFile(ctx, vdsoBin))
+func PrepareVDSO(mfp pgalloc.MemoryFileProvider) (*VDSO, error) {
+ vdsoFile := &byteFullReader{data: vdsoBin}
// First make sure the VDSO is valid. vdsoFile does not use ctx, so a
// nil context can be passed.
info, err := validateVDSO(nil, vdsoFile, uint64(len(vdsoBin)))
- vdsoFile.DecRef()
if err != nil {
return nil, err
}
@@ -431,3 +380,9 @@ func loadVDSO(ctx context.Context, m *mm.MemoryManager, v *VDSO, bin loadedELF)
return vdsoAddr, nil
}
+
+// Release drops references on mappings held by v.
+func (v *VDSO) Release(ctx context.Context) {
+ v.ParamPage.DecRef(ctx)
+ v.vdso.DecRef(ctx)
+}
diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD
index a98b66de1..2c95669cd 100644
--- a/pkg/sentry/memmap/BUILD
+++ b/pkg/sentry/memmap/BUILD
@@ -28,9 +28,21 @@ go_template_instance(
},
)
+go_template_instance(
+ name = "file_range",
+ out = "file_range.go",
+ package = "memmap",
+ prefix = "File",
+ template = "//pkg/segment:generic_range",
+ types = {
+ "T": "uint64",
+ },
+)
+
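For reference, the instantiation generates, approximately (a sketch, not the verbatim go_generics output), a value type with the usual range helpers:

	type FileRange struct {
		Start, End uint64
	}

	func (r FileRange) Length() uint64         { return r.End - r.Start }
	func (r FileRange) WellFormed() bool       { return r.Start <= r.End }
	func (r FileRange) Contains(x uint64) bool { return r.Start <= x && x < r.End }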
go_library(
name = "memmap",
srcs = [
+ "file_range.go",
"mappable_range.go",
"mapping_set.go",
"mapping_set_impl.go",
@@ -40,7 +52,7 @@ go_library(
deps = [
"//pkg/context",
"//pkg/log",
- "//pkg/sentry/platform",
+ "//pkg/safemem",
"//pkg/syserror",
"//pkg/usermem",
],
diff --git a/pkg/sentry/memmap/mapping_set.go b/pkg/sentry/memmap/mapping_set.go
index d609c1ae0..457ed87f8 100644
--- a/pkg/sentry/memmap/mapping_set.go
+++ b/pkg/sentry/memmap/mapping_set.go
@@ -177,7 +177,7 @@ func subsetMapping(wholeRange, subsetRange MappableRange, ms MappingSpace, addr
// AddMapping adds the given mapping and returns the set of MappableRanges that
// previously had no mappings.
//
-// Preconditions: As for Mappable.AddMapping.
+// Preconditions: Same as Mappable.AddMapping.
func (s *MappingSet) AddMapping(ms MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) []MappableRange {
mr := MappableRange{offset, offset + uint64(ar.Length())}
var mapped []MappableRange
@@ -204,7 +204,7 @@ func (s *MappingSet) AddMapping(ms MappingSpace, ar usermem.AddrRange, offset ui
// RemoveMapping removes the given mapping and returns the set of
// MappableRanges that now have no mappings.
//
-// Preconditions: As for Mappable.RemoveMapping.
+// Preconditions: Same as Mappable.RemoveMapping.
func (s *MappingSet) RemoveMapping(ms MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) []MappableRange {
mr := MappableRange{offset, offset + uint64(ar.Length())}
var unmapped []MappableRange
diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go
index c6db9fc8f..7fd77925f 100644
--- a/pkg/sentry/memmap/memmap.go
+++ b/pkg/sentry/memmap/memmap.go
@@ -19,18 +19,18 @@ import (
"fmt"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/usermem"
)
// Mappable represents a memory-mappable object, a mutable mapping from uint64
-// offsets to (platform.File, uint64 File offset) pairs.
+// offsets to (File, uint64 File offset) pairs.
//
// See mm/mm.go for Mappable's place in the lock order.
//
-// Preconditions: For all Mappable methods, usermem.AddrRanges and
-// MappableRanges must be non-empty (Length() != 0), and usermem.Addrs and
-// Mappable offsets must be page-aligned.
+// All Mappable methods have the following preconditions:
+// * usermem.AddrRanges and MappableRanges must be non-empty (Length() != 0).
+// * usermem.Addrs and Mappable offsets must be page-aligned.
type Mappable interface {
// AddMapping notifies the Mappable of a mapping from addresses ar in ms to
// offsets [offset, offset+ar.Length()) in this Mappable.
@@ -48,8 +48,10 @@ type Mappable interface {
// addresses ar in ms to offsets [offset, offset+ar.Length()) in this
// Mappable.
//
- // Preconditions: offset+ar.Length() does not overflow. The removed mapping
- // must exist. writable must match the corresponding call to AddMapping.
+ // Preconditions:
+ // * offset+ar.Length() does not overflow.
+ // * The removed mapping must exist.
+ // * writable must match the corresponding call to AddMapping.
RemoveMapping(ctx context.Context, ms MappingSpace, ar usermem.AddrRange, offset uint64, writable bool)
// CopyMapping notifies the Mappable of an attempt to copy a mapping in ms
@@ -60,9 +62,10 @@ type Mappable interface {
// CopyMapping is only called when a mapping is copied within a given
// MappingSpace; it is analogous to Linux's vm_operations_struct::mremap.
//
- // Preconditions: offset+srcAR.Length() and offset+dstAR.Length() do not
- // overflow. The mapping at srcAR must exist. writable must match the
- // corresponding call to AddMapping.
+ // Preconditions:
+ // * offset+srcAR.Length() and offset+dstAR.Length() do not overflow.
+ // * The mapping at srcAR must exist.
+ // * writable must match the corresponding call to AddMapping.
CopyMapping(ctx context.Context, ms MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error
// Translate returns the Mappable's current mappings for at least the range
@@ -74,14 +77,17 @@ type Mappable interface {
// Translations are valid until invalidated by a callback to
// MappingSpace.Invalidate or until the caller removes its mapping of the
// translated range. Mappable implementations must ensure that at least one
- // reference is held on all pages in a platform.File that may be the result
+ // reference is held on all pages in a File that may be the result
// of a valid Translation.
//
- // Preconditions: required.Length() > 0. optional.IsSupersetOf(required).
- // required and optional must be page-aligned. The caller must have
- // established a mapping for all of the queried offsets via a previous call
- // to AddMapping. The caller is responsible for ensuring that calls to
- // Translate synchronize with invalidation.
+ // Preconditions:
+ // * required.Length() > 0.
+ // * optional.IsSupersetOf(required).
+ // * required and optional must be page-aligned.
+ // * The caller must have established a mapping for all of the queried
+ // offsets via a previous call to AddMapping.
+ // * The caller is responsible for ensuring that calls to Translate
+ // synchronize with invalidation.
//
// Postconditions: See CheckTranslateResult.
Translate(ctx context.Context, required, optional MappableRange, at usermem.AccessType) ([]Translation, error)
@@ -100,7 +106,7 @@ type Translation struct {
Source MappableRange
// File is the mapped file.
- File platform.File
+ File File
// Offset is the offset into File at which this Translation begins.
Offset uint64
@@ -110,18 +116,18 @@ type Translation struct {
Perms usermem.AccessType
}
-// FileRange returns the platform.FileRange represented by t.
-func (t Translation) FileRange() platform.FileRange {
- return platform.FileRange{t.Offset, t.Offset + t.Source.Length()}
+// FileRange returns the FileRange represented by t.
+func (t Translation) FileRange() FileRange {
+ return FileRange{t.Offset, t.Offset + t.Source.Length()}
}
// CheckTranslateResult returns an error if (ts, terr) does not satisfy all
// postconditions for Mappable.Translate(required, optional, at).
//
-// Preconditions: As for Mappable.Translate.
+// Preconditions: Same as Mappable.Translate.
func CheckTranslateResult(required, optional MappableRange, at usermem.AccessType, ts []Translation, terr error) error {
// Verify that the inputs to Mappable.Translate were valid.
- if !required.WellFormed() || required.Length() <= 0 {
+ if !required.WellFormed() || required.Length() == 0 {
panic(fmt.Sprintf("invalid required range: %v", required))
}
if !usermem.Addr(required.Start).IsPageAligned() || !usermem.Addr(required.End).IsPageAligned() {
@@ -139,7 +145,7 @@ func CheckTranslateResult(required, optional MappableRange, at usermem.AccessTyp
return fmt.Errorf("first Translation %+v does not cover start of required range %v", ts[0], required)
}
for i, t := range ts {
- if !t.Source.WellFormed() || t.Source.Length() <= 0 {
+ if !t.Source.WellFormed() || t.Source.Length() == 0 {
return fmt.Errorf("Translation %+v has invalid Source", t)
}
if !usermem.Addr(t.Source.Start).IsPageAligned() || !usermem.Addr(t.Source.End).IsPageAligned() {
@@ -214,7 +220,9 @@ type MappingSpace interface {
// Invalidate must not take any locks preceding mm.MemoryManager.activeMu
// in the lock order.
//
- // Preconditions: ar.Length() != 0. ar must be page-aligned.
+ // Preconditions:
+ // * ar.Length() != 0.
+ // * ar must be page-aligned.
Invalidate(ar usermem.AddrRange, opts InvalidateOpts)
}
@@ -238,7 +246,7 @@ type MappingIdentity interface {
IncRef()
// DecRef decrements the MappingIdentity's reference count.
- DecRef()
+ DecRef(ctx context.Context)
// MappedName returns the application-visible name shown in
// /proc/[pid]/maps.
@@ -360,4 +368,62 @@ type MMapOpts struct {
//
// TODO(jamieliu): Replace entirely with MappingIdentity?
Hint string
+
+ // Force means to skip validation checks of Addr and Length. It can be
+ // used to create special mappings below mm.layout.MinAddr and
+ // mm.layout.MaxAddr. Use it with caution.
+ //
+ // If Force is true, Unmap and Fixed must be true.
+ Force bool
+}
+
+// File represents a host file that may be mapped into a platform.AddressSpace.
+type File interface {
+ // All pages in a File are reference-counted.
+
+ // IncRef increments the reference count on all pages in fr.
+ //
+ // Preconditions:
+ // * fr.Start and fr.End must be page-aligned.
+ // * fr.Length() > 0.
+ // * At least one reference must be held on all pages in fr. (The File
+ // interface does not provide a way to acquire an initial reference;
+ // implementors may define mechanisms for doing so.)
+ IncRef(fr FileRange)
+
+ // DecRef decrements the reference count on all pages in fr.
+ //
+ // Preconditions:
+ // * fr.Start and fr.End must be page-aligned.
+ // * fr.Length() > 0.
+ // * At least one reference must be held on all pages in fr.
+ DecRef(fr FileRange)
+
+ // MapInternal returns a mapping of the given file offsets in the invoking
+ // process' address space for reading and writing.
+ //
+ // Note that fr.Start and fr.End need not be page-aligned.
+ //
+ // Preconditions:
+ // * fr.Length() > 0.
+ // * At least one reference must be held on all pages in fr.
+ //
+ // Postconditions: The returned mapping is valid as long as at least one
+ // reference is held on the mapped pages.
+ MapInternal(fr FileRange, at usermem.AccessType) (safemem.BlockSeq, error)
+
+ // FD returns the file descriptor represented by the File.
+ //
+ // The only permitted operation on the returned file descriptor is to map
+ // pages from it consistent with the requirements of AddressSpace.MapFile.
+ FD() int
+}
+
+// FileRange represents a range of uint64 offsets into a File.
+//
+// type FileRange <generated using go_generics>
+
+// String implements fmt.Stringer.String.
+func (fr FileRange) String() string {
+ return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End)
}
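As a usage sketch (a hypothetical helper, not part of the interface), MapInternal lets sentry code address file pages directly; for example, zero-filling a range whose pages the caller already references:

	func zeroFileRange(f File, fr FileRange) error {
		bs, err := f.MapInternal(fr, usermem.Write)
		if err != nil {
			return err
		}
		_, err = safemem.ZeroSeq(bs) // write zeroes through the internal mapping
		return err
	}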
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD
index a036ce53c..6dbeccfe2 100644
--- a/pkg/sentry/mm/BUILD
+++ b/pkg/sentry/mm/BUILD
@@ -7,14 +7,14 @@ go_template_instance(
name = "file_refcount_set",
out = "file_refcount_set.go",
imports = {
- "platform": "gvisor.dev/gvisor/pkg/sentry/platform",
+ "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap",
},
package = "mm",
prefix = "fileRefcount",
template = "//pkg/segment:generic_set",
types = {
"Key": "uint64",
- "Range": "platform.FileRange",
+ "Range": "memmap.FileRange",
"Value": "int32",
"Functions": "fileRefcountSetFunctions",
},
@@ -73,12 +73,35 @@ go_template_instance(
},
)
+go_template_instance(
+ name = "aio_mappable_refs",
+ out = "aio_mappable_refs.go",
+ package = "mm",
+ prefix = "aioMappable",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "aioMappable",
+ },
+)
+
+go_template_instance(
+ name = "special_mappable_refs",
+ out = "special_mappable_refs.go",
+ package = "mm",
+ prefix = "SpecialMappable",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "SpecialMappable",
+ },
+)
+
go_library(
name = "mm",
srcs = [
"address_space.go",
"aio_context.go",
"aio_context_state.go",
+ "aio_mappable_refs.go",
"debug.go",
"file_refcount_set.go",
"io.go",
@@ -92,6 +115,7 @@ go_library(
"save_restore.go",
"shm.go",
"special_mappable.go",
+ "special_mappable_refs.go",
"syscalls.go",
"vma.go",
"vma_set.go",
@@ -103,6 +127,7 @@ go_library(
"//pkg/context",
"//pkg/log",
"//pkg/refs",
+ "//pkg/refsvfs2",
"//pkg/safecopy",
"//pkg/safemem",
"//pkg/sentry/arch",
diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go
index 5c667117c..a93e76c75 100644
--- a/pkg/sentry/mm/address_space.go
+++ b/pkg/sentry/mm/address_space.go
@@ -166,8 +166,12 @@ func (mm *MemoryManager) Deactivate() {
// mapASLocked maps addresses in ar into mm.as. If precommit is true, mappings
// for all addresses in ar should be precommitted.
//
-// Preconditions: mm.activeMu must be locked. mm.as != nil. ar.Length() != 0.
-// ar must be page-aligned. pseg == mm.pmas.LowerBoundSegment(ar.Start).
+// Preconditions:
+// * mm.activeMu must be locked.
+// * mm.as != nil.
+// * ar.Length() != 0.
+// * ar must be page-aligned.
+// * pseg == mm.pmas.LowerBoundSegment(ar.Start).
func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, precommit bool) error {
// By default, map entire pmas at a time, under the assumption that there
// is no cost to mapping more of a pma than necessary.
diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go
index 379148903..7bf48cb2c 100644
--- a/pkg/sentry/mm/aio_context.go
+++ b/pkg/sentry/mm/aio_context.go
@@ -17,10 +17,8 @@ package mm
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
@@ -240,10 +238,10 @@ func (ctx *AIOContext) Drain() {
//
// +stateify savable
type aioMappable struct {
- refs.AtomicRefCount
+ aioMappableRefs
mfp pgalloc.MemoryFileProvider
- fr platform.FileRange
+ fr memmap.FileRange
}
var aioRingBufferSize = uint64(usermem.Addr(linux.AIORingSize).MustRoundUp())
@@ -254,13 +252,13 @@ func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) {
return nil, err
}
m := aioMappable{mfp: mfp, fr: fr}
- m.EnableLeakCheck("mm.aioMappable")
+ m.EnableLeakCheck()
return &m, nil
}
// DecRef implements refs.RefCounter.DecRef.
-func (m *aioMappable) DecRef() {
- m.AtomicRefCount.DecRefWithDestructor(func() {
+func (m *aioMappable) DecRef(ctx context.Context) {
+ m.aioMappableRefs.DecRef(func() {
m.mfp.MemoryFile().DecRef(m.fr)
})
}
@@ -368,7 +366,7 @@ func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint
if err != nil {
return 0, err
}
- defer m.DecRef()
+ defer m.DecRef(ctx)
addr, err := mm.MMap(ctx, memmap.MMapOpts{
Length: aioRingBufferSize,
MappingIdentity: m,
diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go
index fa776f9c6..a8ac48080 100644
--- a/pkg/sentry/mm/io.go
+++ b/pkg/sentry/mm/io.go
@@ -441,7 +441,10 @@ func (mm *MemoryManager) LoadUint32(ctx context.Context, addr usermem.Addr, opts
// handleASIOFault handles a page fault at address addr for an AddressSpaceIO
// operation spanning ioar.
//
-// Preconditions: mm.as != nil. ioar.Length() != 0. ioar.Contains(addr).
+// Preconditions:
+// * mm.as != nil.
+// * ioar.Length() != 0.
+// * ioar.Contains(addr).
func (mm *MemoryManager) handleASIOFault(ctx context.Context, addr usermem.Addr, ioar usermem.AddrRange, at usermem.AccessType) error {
// Try to map all remaining pages in the I/O operation. This RoundUp can't
// overflow because otherwise it would have been caught by CheckIORange.
@@ -629,7 +632,9 @@ func (mm *MemoryManager) withVecInternalMappings(ctx context.Context, ars userme
// at most address end on AddrRange arsit.Head(). It is used in vector I/O paths to
// truncate usermem.AddrRangeSeq when errors occur.
//
-// Preconditions: !arsit.IsEmpty(). end <= arsit.Head().End.
+// Preconditions:
+// * !arsit.IsEmpty().
+// * end <= arsit.Head().End.
func truncatedAddrRangeSeq(ars, arsit usermem.AddrRangeSeq, end usermem.Addr) usermem.AddrRangeSeq {
ar := arsit.Head()
if end <= ar.Start {
diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go
index aac56679b..09dbc06a4 100644
--- a/pkg/sentry/mm/lifecycle.go
+++ b/pkg/sentry/mm/lifecycle.go
@@ -57,6 +57,8 @@ func (mm *MemoryManager) SetMmapLayout(ac arch.Context, r *limits.LimitSet) (arc
// Fork creates a copy of mm with 1 user, as for Linux syscalls fork() or
// clone() (without CLONE_VM).
func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
+ mm.AddressSpace().PreFork()
+ defer mm.AddressSpace().PostFork()
mm.metadataMu.Lock()
defer mm.metadataMu.Unlock()
mm.mappingMu.RLock()
@@ -258,7 +260,7 @@ func (mm *MemoryManager) DecUsers(ctx context.Context) {
mm.executable = nil
mm.metadataMu.Unlock()
if exe != nil {
- exe.DecRef()
+ exe.DecRef(ctx)
}
mm.activeMu.Lock()
diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go
index 28e5057f7..0cfd60f6c 100644
--- a/pkg/sentry/mm/metadata.go
+++ b/pkg/sentry/mm/metadata.go
@@ -15,6 +15,7 @@
package mm
import (
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fsbridge"
"gvisor.dev/gvisor/pkg/usermem"
@@ -147,7 +148,7 @@ func (mm *MemoryManager) Executable() fsbridge.File {
// SetExecutable sets the executable.
//
// This takes a reference on d.
-func (mm *MemoryManager) SetExecutable(file fsbridge.File) {
+func (mm *MemoryManager) SetExecutable(ctx context.Context, file fsbridge.File) {
mm.metadataMu.Lock()
// Grab a new reference.
@@ -164,7 +165,7 @@ func (mm *MemoryManager) SetExecutable(file fsbridge.File) {
// Do this without holding the lock, since it may wind up doing some
// I/O to sync the dirent, etc.
if orig != nil {
- orig.DecRef()
+ orig.DecRef(ctx)
}
}
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index 6db7c3d40..92cc87d84 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -25,7 +25,7 @@
// Locks taken by memmap.Mappable.Translate
// mm.privateRefs.mu
// platform.AddressSpace locks
-// platform.File locks
+// memmap.File locks
// mm.aioManager.mu
// mm.AIOContext.mu
//
@@ -235,6 +235,20 @@ type MemoryManager struct {
// vdsoSigReturnAddr is the address of 'vdso_sigreturn'.
vdsoSigReturnAddr uint64
+
+ // membarrierPrivateEnabled is non-zero if EnableMembarrierPrivate has
+ // previously been called. Since, as of this writing,
+ // MEMBARRIER_CMD_PRIVATE_EXPEDITED is implemented as a global memory
+ // barrier, membarrierPrivateEnabled has no other effect.
+ //
+ // membarrierPrivateEnabled is accessed using atomic memory operations.
+ membarrierPrivateEnabled uint32
+
+ // membarrierRSeqEnabled is non-zero if EnableMembarrierRSeq has previously
+ // been called.
+ //
+ // membarrierRSeqEnabled is accessed using atomic memory operations.
+ membarrierRSeqEnabled uint32
}
// vma represents a virtual memory area.
@@ -242,7 +256,7 @@ type MemoryManager struct {
// +stateify savable
type vma struct {
// mappable is the virtual memory object mapped by this vma. If mappable is
- // nil, the vma represents a private anonymous mapping.
+ // nil, the vma represents an anonymous mapping.
mappable memmap.Mappable
// off is the offset into mappable at which this vma begins. If mappable is
@@ -396,7 +410,7 @@ type pma struct {
// file is the file mapped by this pma. Only pmas for which file ==
// MemoryManager.mfp.MemoryFile() may be saved. pmas hold a reference to
// the corresponding file range while they exist.
- file platform.File `state:"nosave"`
+ file memmap.File `state:"nosave"`
// off is the offset into file at which this pma begins.
//
@@ -436,7 +450,7 @@ type pma struct {
private bool
// If internalMappings is not empty, it is the cached return value of
- // file.MapInternal for the platform.FileRange mapped by this pma.
+ // file.MapInternal for the memmap.FileRange mapped by this pma.
internalMappings safemem.BlockSeq `state:"nosave"`
}
@@ -469,10 +483,10 @@ func (fileRefcountSetFunctions) MaxKey() uint64 {
func (fileRefcountSetFunctions) ClearValue(_ *int32) {
}
-func (fileRefcountSetFunctions) Merge(_ platform.FileRange, rc1 int32, _ platform.FileRange, rc2 int32) (int32, bool) {
+func (fileRefcountSetFunctions) Merge(_ memmap.FileRange, rc1 int32, _ memmap.FileRange, rc2 int32) (int32, bool) {
return rc1, rc1 == rc2
}
-func (fileRefcountSetFunctions) Split(_ platform.FileRange, rc int32, _ uint64) (int32, int32) {
+func (fileRefcountSetFunctions) Split(_ memmap.FileRange, rc int32, _ uint64) (int32, int32) {
return rc, rc
}
diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go
index fdc308542..acac3d357 100644
--- a/pkg/sentry/mm/mm_test.go
+++ b/pkg/sentry/mm/mm_test.go
@@ -51,7 +51,8 @@ func TestUsageASUpdates(t *testing.T) {
defer mm.DecUsers(ctx)
addr, err := mm.MMap(ctx, memmap.MMapOpts{
- Length: 2 * usermem.PageSize,
+ Length: 2 * usermem.PageSize,
+ Private: true,
})
if err != nil {
t.Fatalf("MMap got err %v want nil", err)
diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go
index 62e4c20af..7e5f7de64 100644
--- a/pkg/sentry/mm/pma.go
+++ b/pkg/sentry/mm/pma.go
@@ -21,7 +21,6 @@ import (
"gvisor.dev/gvisor/pkg/safecopy"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -32,10 +31,12 @@ import (
// iterator to the pma containing ar.Start. Otherwise it returns a terminal
// iterator.
//
-// Preconditions: mm.activeMu must be locked. ar.Length() != 0.
+// Preconditions:
+// * mm.activeMu must be locked.
+// * ar.Length() != 0.
func (mm *MemoryManager) existingPMAsLocked(ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool, needInternalMappings bool) pmaIterator {
if checkInvariants {
- if !ar.WellFormed() || ar.Length() <= 0 {
+ if !ar.WellFormed() || ar.Length() == 0 {
panic(fmt.Sprintf("invalid ar: %v", ar))
}
}
@@ -90,13 +91,16 @@ func (mm *MemoryManager) existingVecPMAsLocked(ars usermem.AddrRangeSeq, at user
//
// - An error that is non-nil if pmas exist for only a subset of ar.
//
-// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for
-// writing. ar.Length() != 0. vseg.Range().Contains(ar.Start). vmas must exist
-// for all addresses in ar, and support accesses of type at (i.e. permission
-// checks must have been performed against vmas).
+// Preconditions:
+// * mm.mappingMu must be locked.
+// * mm.activeMu must be locked for writing.
+// * ar.Length() != 0.
+// * vseg.Range().Contains(ar.Start).
+// * vmas must exist for all addresses in ar, and support accesses of type at
+// (i.e. permission checks must have been performed against vmas).
func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, at usermem.AccessType) (pmaIterator, pmaGapIterator, error) {
if checkInvariants {
- if !ar.WellFormed() || ar.Length() <= 0 {
+ if !ar.WellFormed() || ar.Length() == 0 {
panic(fmt.Sprintf("invalid ar: %v", ar))
}
if !vseg.Ok() {
@@ -136,9 +140,11 @@ func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar
// exist. If this is not equal to ars, it returns a non-nil error explaining
// why.
//
-// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for
-// writing. vmas must exist for all addresses in ars, and support accesses of
-// type at (i.e. permission checks must have been performed against vmas).
+// Preconditions:
+// * mm.mappingMu must be locked.
+// * mm.activeMu must be locked for writing.
+// * vmas must exist for all addresses in ars, and support accesses of type at
+// (i.e. permission checks must have been performed against vmas).
func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType) (usermem.AddrRangeSeq, error) {
for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
ar := arsit.Head()
@@ -187,7 +193,7 @@ func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars usermem.AddrR
// getVecPMAsLocked; other clients should call one of those instead.
func (mm *MemoryManager) getPMAsInternalLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, at usermem.AccessType) (pmaIterator, pmaGapIterator, error) {
if checkInvariants {
- if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+ if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
panic(fmt.Sprintf("invalid ar: %v", ar))
}
if !vseg.Ok() {
@@ -217,7 +223,7 @@ func (mm *MemoryManager) getPMAsInternalLocked(ctx context.Context, vseg vmaIter
// Need a pma here.
optAR := vseg.Range().Intersect(pgap.Range())
if checkInvariants {
- if optAR.Length() <= 0 {
+ if optAR.Length() == 0 {
panic(fmt.Sprintf("vseg %v and pgap %v do not overlap", vseg, pgap))
}
}
@@ -519,8 +525,10 @@ func privateAligned(ar usermem.AddrRange) usermem.AddrRange {
// the memory it maps, isPMACopyOnWriteLocked will take ownership of the memory
// and update the pma to indicate that it does not require copy-on-write.
//
-// Preconditions: vseg.Range().IsSupersetOf(pseg.Range()). mm.mappingMu must be
-// locked. mm.activeMu must be locked for writing.
+// Preconditions:
+// * vseg.Range().IsSupersetOf(pseg.Range()).
+// * mm.mappingMu must be locked.
+// * mm.activeMu must be locked for writing.
func (mm *MemoryManager) isPMACopyOnWriteLocked(vseg vmaIterator, pseg pmaIterator) bool {
pma := pseg.ValuePtr()
if !pma.needCOW {
@@ -552,7 +560,7 @@ func (mm *MemoryManager) isPMACopyOnWriteLocked(vseg vmaIterator, pseg pmaIterat
// Invalidate implements memmap.MappingSpace.Invalidate.
func (mm *MemoryManager) Invalidate(ar usermem.AddrRange, opts memmap.InvalidateOpts) {
if checkInvariants {
- if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+ if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
panic(fmt.Sprintf("invalid ar: %v", ar))
}
}
@@ -569,11 +577,13 @@ func (mm *MemoryManager) Invalidate(ar usermem.AddrRange, opts memmap.Invalidate
// invalidateLocked removes pmas and AddressSpace mappings of those pmas for
// addresses in ar.
//
-// Preconditions: mm.activeMu must be locked for writing. ar.Length() != 0. ar
-// must be page-aligned.
+// Preconditions:
+// * mm.activeMu must be locked for writing.
+// * ar.Length() != 0.
+// * ar must be page-aligned.
func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivate, invalidateShared bool) {
if checkInvariants {
- if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+ if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
panic(fmt.Sprintf("invalid ar: %v", ar))
}
}
@@ -604,7 +614,7 @@ func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivat
}
}
-// Pin returns the platform.File ranges currently mapped by addresses in ar in
+// Pin returns the memmap.File ranges currently mapped by addresses in ar in
// mm, acquiring a reference on the returned ranges which the caller must
// release by calling Unpin. If not all addresses are mapped, Pin returns a
// non-nil error. Note that Pin may return both a non-empty slice of
@@ -614,10 +624,12 @@ func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivat
// most I/O. It should only be used in contexts that would use get_user_pages()
// in the Linux kernel.
//
-// Preconditions: ar.Length() != 0. ar must be page-aligned.
+// Preconditions:
+// * ar.Length() != 0.
+// * ar must be page-aligned.
func (mm *MemoryManager) Pin(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool) ([]PinnedRange, error) {
if checkInvariants {
- if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+ if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
panic(fmt.Sprintf("invalid ar: %v", ar))
}
}
@@ -674,15 +686,15 @@ type PinnedRange struct {
Source usermem.AddrRange
// File is the mapped file.
- File platform.File
+ File memmap.File
// Offset is the offset into File at which this PinnedRange begins.
Offset uint64
}
-// FileRange returns the platform.File offsets mapped by pr.
-func (pr PinnedRange) FileRange() platform.FileRange {
- return platform.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())}
+// FileRange returns the memmap.File offsets mapped by pr.
+func (pr PinnedRange) FileRange() memmap.FileRange {
+ return memmap.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())}
}
// Unpin releases the reference held by prs.
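A usage sketch (hypothetical helper; assumes ar is page-aligned and fully mapped):

	func withPinnedRanges(ctx context.Context, mm *MemoryManager, ar usermem.AddrRange) error {
		prs, err := mm.Pin(ctx, ar, usermem.Read, false /* ignorePermissions */)
		// Pin may return pinned ranges alongside an error, so always Unpin.
		defer Unpin(prs)
		if err != nil {
			return err
		}
		for _, pr := range prs {
			// Each PinnedRange pairs source addresses with (File, offset);
			// pr.File.FD() plus pr.FileRange() could feed host I/O here.
			_ = pr.FileRange()
		}
		return nil
	}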
@@ -694,15 +706,19 @@ func Unpin(prs []PinnedRange) {
// movePMAsLocked moves all pmas in oldAR to newAR.
//
-// Preconditions: mm.activeMu must be locked for writing. oldAR.Length() != 0.
-// oldAR.Length() <= newAR.Length(). !oldAR.Overlaps(newAR).
-// mm.pmas.IsEmptyRange(newAR). oldAR and newAR must be page-aligned.
+// Preconditions:
+// * mm.activeMu must be locked for writing.
+// * oldAR.Length() != 0.
+// * oldAR.Length() <= newAR.Length().
+// * !oldAR.Overlaps(newAR).
+// * mm.pmas.IsEmptyRange(newAR).
+// * oldAR and newAR must be page-aligned.
func (mm *MemoryManager) movePMAsLocked(oldAR, newAR usermem.AddrRange) {
if checkInvariants {
- if !oldAR.WellFormed() || oldAR.Length() <= 0 || !oldAR.IsPageAligned() {
+ if !oldAR.WellFormed() || oldAR.Length() == 0 || !oldAR.IsPageAligned() {
panic(fmt.Sprintf("invalid oldAR: %v", oldAR))
}
- if !newAR.WellFormed() || newAR.Length() <= 0 || !newAR.IsPageAligned() {
+ if !newAR.WellFormed() || newAR.Length() == 0 || !newAR.IsPageAligned() {
panic(fmt.Sprintf("invalid newAR: %v", newAR))
}
if oldAR.Length() > newAR.Length() {
@@ -752,15 +768,17 @@ func (mm *MemoryManager) movePMAsLocked(oldAR, newAR usermem.AddrRange) {
// - An error that is non-nil if internal mappings exist for only a subset of
// ar.
//
-// Preconditions: mm.activeMu must be locked for writing.
-// pseg.Range().Contains(ar.Start). pmas must exist for all addresses in ar.
-// ar.Length() != 0.
+// Preconditions:
+// * mm.activeMu must be locked for writing.
+// * pseg.Range().Contains(ar.Start).
+// * pmas must exist for all addresses in ar.
+// * ar.Length() != 0.
//
// Postconditions: getPMAInternalMappingsLocked does not invalidate iterators
// into mm.pmas.
func (mm *MemoryManager) getPMAInternalMappingsLocked(pseg pmaIterator, ar usermem.AddrRange) (pmaGapIterator, error) {
if checkInvariants {
- if !ar.WellFormed() || ar.Length() <= 0 {
+ if !ar.WellFormed() || ar.Length() == 0 {
panic(fmt.Sprintf("invalid ar: %v", ar))
}
if !pseg.Range().Contains(ar.Start) {
@@ -784,8 +802,9 @@ func (mm *MemoryManager) getPMAInternalMappingsLocked(pseg pmaIterator, ar userm
// internal mappings exist. If this is not equal to ars, it returns a non-nil
// error explaining why.
//
-// Preconditions: mm.activeMu must be locked for writing. pmas must exist for
-// all addresses in ar.
+// Preconditions:
+// * mm.activeMu must be locked for writing.
+// * pmas must exist for all addresses in ar.
//
// Postconditions: getVecPMAInternalMappingsLocked does not invalidate iterators
// into mm.pmas.
@@ -804,12 +823,15 @@ func (mm *MemoryManager) getVecPMAInternalMappingsLocked(ars usermem.AddrRangeSe
// internalMappingsLocked returns internal mappings for addresses in ar.
//
-// Preconditions: mm.activeMu must be locked. Internal mappings must have been
-// previously established for all addresses in ar. ar.Length() != 0.
-// pseg.Range().Contains(ar.Start).
+// Preconditions:
+// * mm.activeMu must be locked.
+// * Internal mappings must have been previously established for all addresses
+// in ar.
+// * ar.Length() != 0.
+// * pseg.Range().Contains(ar.Start).
func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar usermem.AddrRange) safemem.BlockSeq {
if checkInvariants {
- if !ar.WellFormed() || ar.Length() <= 0 {
+ if !ar.WellFormed() || ar.Length() == 0 {
panic(fmt.Sprintf("invalid ar: %v", ar))
}
if !pseg.Range().Contains(ar.Start) {
@@ -840,8 +862,10 @@ func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar usermem.Add
// vecInternalMappingsLocked returns internal mappings for addresses in ars.
//
-// Preconditions: mm.activeMu must be locked. Internal mappings must have been
-// previously established for all addresses in ars.
+// Preconditions:
+// * mm.activeMu must be locked.
+// * Internal mappings must have been previously established for all addresses
+// in ars.
func (mm *MemoryManager) vecInternalMappingsLocked(ars usermem.AddrRangeSeq) safemem.BlockSeq {
var ims []safemem.Block
for ; !ars.IsEmpty(); ars = ars.Tail() {
@@ -857,7 +881,7 @@ func (mm *MemoryManager) vecInternalMappingsLocked(ars usermem.AddrRangeSeq) saf
}
// incPrivateRef acquires a reference on private pages in fr.
-func (mm *MemoryManager) incPrivateRef(fr platform.FileRange) {
+func (mm *MemoryManager) incPrivateRef(fr memmap.FileRange) {
mm.privateRefs.mu.Lock()
defer mm.privateRefs.mu.Unlock()
refSet := &mm.privateRefs.refs
@@ -878,8 +902,8 @@ func (mm *MemoryManager) incPrivateRef(fr platform.FileRange) {
}
// decPrivateRef releases a reference on private pages in fr.
-func (mm *MemoryManager) decPrivateRef(fr platform.FileRange) {
- var freed []platform.FileRange
+func (mm *MemoryManager) decPrivateRef(fr memmap.FileRange) {
+ var freed []memmap.FileRange
mm.privateRefs.mu.Lock()
refSet := &mm.privateRefs.refs
@@ -951,7 +975,7 @@ func (pmaSetFunctions) Merge(ar1 usermem.AddrRange, pma1 pma, ar2 usermem.AddrRa
// Discard internal mappings instead of trying to merge them, since merging
// them requires an allocation and getting them again from the
- // platform.File might not.
+ // memmap.File might not.
pma1.internalMappings = safemem.BlockSeq{}
return pma1, true
}
@@ -970,7 +994,9 @@ func (pmaSetFunctions) Split(ar usermem.AddrRange, p pma, split usermem.Addr) (p
// findOrSeekPrevUpperBoundPMA returns mm.pmas.UpperBoundSegment(addr), but may do
// so by scanning linearly backward from pgap.
//
-// Preconditions: mm.activeMu must be locked. addr <= pgap.Start().
+// Preconditions:
+// * mm.activeMu must be locked.
+// * addr <= pgap.Start().
func (mm *MemoryManager) findOrSeekPrevUpperBoundPMA(addr usermem.Addr, pgap pmaGapIterator) pmaIterator {
if checkInvariants {
if !pgap.Ok() {
@@ -1012,17 +1038,19 @@ func (pseg pmaIterator) getInternalMappingsLocked() error {
return nil
}
-func (pseg pmaIterator) fileRange() platform.FileRange {
+func (pseg pmaIterator) fileRange() memmap.FileRange {
return pseg.fileRangeOf(pseg.Range())
}
-// Preconditions: pseg.Range().IsSupersetOf(ar). ar.Length != 0.
-func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) platform.FileRange {
+// Preconditions:
+// * pseg.Range().IsSupersetOf(ar).
+// * ar.Length != 0.
+func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) memmap.FileRange {
if checkInvariants {
if !pseg.Ok() {
panic("terminal pma iterator")
}
- if !ar.WellFormed() || ar.Length() <= 0 {
+ if !ar.WellFormed() || ar.Length() == 0 {
panic(fmt.Sprintf("invalid ar: %v", ar))
}
if !pseg.Range().IsSupersetOf(ar) {
@@ -1032,5 +1060,5 @@ func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) platform.FileRange {
pma := pseg.ValuePtr()
pstart := pseg.Start()
- return platform.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)}
+ return memmap.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)}
}
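The offset translation in fileRangeOf is mechanical but easy to get backwards: both endpoints of the address range shift by the pma's file offset minus its mapped start. A minimal self-contained sketch of that arithmetic, with toy types standing in for pma and memmap.FileRange:

package main

import "fmt"

// Toy stand-ins for pma and memmap.FileRange: addresses
// [start, start+n) are backed by file bytes [off, off+n).
type pma struct {
	start uint64 // first mapped address
	off   uint64 // file offset backing start
}

type fileRange struct{ Start, End uint64 }

// fileRangeOf mirrors the translation above: shift both endpoints of
// the address range by (off - start).
func (p pma) fileRangeOf(arStart, arEnd uint64) fileRange {
	return fileRange{p.off + (arStart - p.start), p.off + (arEnd - p.start)}
}

func main() {
	p := pma{start: 0x400000, off: 0x1000}
	fmt.Printf("%#x\n", p.fileRangeOf(0x401000, 0x403000)) // {0x2000 0x4000}
}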
diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go
index 9ad52082d..2dbe5b751 100644
--- a/pkg/sentry/mm/special_mappable.go
+++ b/pkg/sentry/mm/special_mappable.go
@@ -16,10 +16,8 @@ package mm
import (
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -32,10 +30,10 @@ import (
//
// +stateify savable
type SpecialMappable struct {
- refs.AtomicRefCount
+ SpecialMappableRefs
mfp pgalloc.MemoryFileProvider
- fr platform.FileRange
+ fr memmap.FileRange
name string
}
@@ -44,15 +42,15 @@ type SpecialMappable struct {
// SpecialMappable will use the given name in /proc/[pid]/maps.
//
// Preconditions: fr.Length() != 0.
-func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *SpecialMappable {
+func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *SpecialMappable {
m := SpecialMappable{mfp: mfp, fr: fr, name: name}
- m.EnableLeakCheck("mm.SpecialMappable")
+ m.EnableLeakCheck()
return &m
}
// DecRef implements refs.RefCounter.DecRef.
-func (m *SpecialMappable) DecRef() {
- m.AtomicRefCount.DecRefWithDestructor(func() {
+func (m *SpecialMappable) DecRef(ctx context.Context) {
+ m.SpecialMappableRefs.DecRef(func() {
m.mfp.MemoryFile().DecRef(m.fr)
})
}
@@ -126,7 +124,7 @@ func (m *SpecialMappable) MemoryFileProvider() pgalloc.MemoryFileProvider {
// FileRange returns the offsets into MemoryFileProvider().MemoryFile() that
// store the SpecialMappable's contents.
-func (m *SpecialMappable) FileRange() platform.FileRange {
+func (m *SpecialMappable) FileRange() memmap.FileRange {
return m.fr
}
@@ -138,9 +136,12 @@ func (m *SpecialMappable) Length() uint64 {
// NewSharedAnonMappable returns a SpecialMappable that implements the
// semantics of mmap(MAP_SHARED|MAP_ANONYMOUS) and mappings of /dev/zero.
//
-// TODO(jamieliu): The use of SpecialMappable is a lazy code reuse hack. Linux
-// uses an ephemeral file created by mm/shmem.c:shmem_zero_setup(); we should
-// do the same to get non-zero device and inode IDs.
+// TODO(gvisor.dev/issue/1624): Linux uses an ephemeral file created by
+// mm/shmem.c:shmem_zero_setup(), and VFS2 does something analogous. VFS1 uses
+// a SpecialMappable instead, incorrectly getting device and inode IDs of zero
+// and causing memory for shared anonymous mappings to be allocated up-front
+// instead of on first touch; this is to avoid exacerbating the fs.MountSource
+// leak (b/143656263). Delete this function along with VFS1.
func NewSharedAnonMappable(length uint64, mfp pgalloc.MemoryFileProvider) (*SpecialMappable, error) {
if length == 0 {
return nil, syserror.EINVAL
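The move from refs.AtomicRefCount to the generated SpecialMappableRefs keeps the same destructor-on-last-DecRef shape, now with a context.Context threaded through. A minimal sketch of that shape with hypothetical names (this is not the generated code, and the ctx parameter is omitted):

package main

import (
	"fmt"
	"sync/atomic"
)

// refs is a minimal stand-in for a generated reference counter: the
// count starts at 1 (held by the creator) and destroy runs exactly
// once, when the count reaches zero.
type refs struct {
	count int64
}

func newRefs() *refs { return &refs{count: 1} }

func (r *refs) IncRef() { atomic.AddInt64(&r.count, 1) }

// DecRef drops a reference and invokes destroy on the final drop,
// matching the DecRef(func()) callback style used above.
func (r *refs) DecRef(destroy func()) {
	if atomic.AddInt64(&r.count, -1) == 0 {
		destroy()
	}
}

func main() {
	r := newRefs()
	r.IncRef()
	r.DecRef(func() { fmt.Println("destroyed") }) // count 2 -> 1, no-op
	r.DecRef(func() { fmt.Println("destroyed") }) // count 1 -> 0, runs
}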
diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go
index 3f496aa9f..675efdc7c 100644
--- a/pkg/sentry/mm/syscalls.go
+++ b/pkg/sentry/mm/syscalls.go
@@ -17,6 +17,7 @@ package mm
import (
"fmt"
mrand "math/rand"
+ "sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
@@ -24,7 +25,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -93,18 +93,6 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme
}
} else {
opts.Offset = 0
- if !opts.Private {
- if opts.MappingIdentity != nil {
- return 0, syserror.EINVAL
- }
- m, err := NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx))
- if err != nil {
- return 0, err
- }
- defer m.DecRef()
- opts.MappingIdentity = m
- opts.Mappable = m
- }
}
if opts.Addr.RoundDown() != opts.Addr {
@@ -166,7 +154,9 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme
// populateVMA obtains pmas for addresses in ar in the given vma, and maps them
// into mm.as if it is active.
//
-// Preconditions: mm.mappingMu must be locked. vseg.Range().IsSupersetOf(ar).
+// Preconditions:
+// * mm.mappingMu must be locked.
+// * vseg.Range().IsSupersetOf(ar).
func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) {
if !vseg.ValuePtr().effectivePerms.Any() {
// Linux doesn't populate inaccessible pages. See
@@ -208,8 +198,9 @@ func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar u
// preferable to populateVMA since it unlocks mm.mappingMu before performing
// expensive operations that don't require it to be locked.
//
-// Preconditions: mm.mappingMu must be locked for writing.
-// vseg.Range().IsSupersetOf(ar).
+// Preconditions:
+// * mm.mappingMu must be locked for writing.
+// * vseg.Range().IsSupersetOf(ar).
//
// Postconditions: mm.mappingMu will be unlocked.
func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) {
@@ -1191,7 +1182,7 @@ func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length ui
mr := vseg.mappableRangeOf(vseg.Range().Intersect(ar))
mm.mappingMu.RUnlock()
err := id.Msync(ctx, mr)
- id.DecRef()
+ id.DecRef(ctx)
if err != nil {
return err
}
@@ -1284,3 +1275,27 @@ func (mm *MemoryManager) VirtualDataSize() uint64 {
defer mm.mappingMu.RUnlock()
return mm.dataAS
}
+
+// EnableMembarrierPrivate causes future calls to IsMembarrierPrivateEnabled to
+// return true.
+func (mm *MemoryManager) EnableMembarrierPrivate() {
+ atomic.StoreUint32(&mm.membarrierPrivateEnabled, 1)
+}
+
+// IsMembarrierPrivateEnabled returns true if mm.EnableMembarrierPrivate() has
+// previously been called.
+func (mm *MemoryManager) IsMembarrierPrivateEnabled() bool {
+ return atomic.LoadUint32(&mm.membarrierPrivateEnabled) != 0
+}
+
+// EnableMembarrierRSeq causes future calls to IsMembarrierRSeqEnabled to
+// return true.
+func (mm *MemoryManager) EnableMembarrierRSeq() {
+ atomic.StoreUint32(&mm.membarrierRSeqEnabled, 1)
+}
+
+// IsMembarrierRSeqEnabled returns true if mm.EnableMembarrierRSeq() has
+// previously been called.
+func (mm *MemoryManager) IsMembarrierRSeqEnabled() bool {
+ return atomic.LoadUint32(&mm.membarrierRSeqEnabled) != 0
+}
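The membarrier accessors added above rely on a write-once flag stored and read with sync/atomic, so IsMembarrierPrivateEnabled needs no lock. The same idiom in a self-contained sketch:

package main

import (
	"fmt"
	"sync/atomic"
)

// flag is a write-once boolean readable without locks, mirroring
// mm.membarrierPrivateEnabled above.
type flag struct{ v uint32 }

func (f *flag) Enable()       { atomic.StoreUint32(&f.v, 1) }
func (f *flag) Enabled() bool { return atomic.LoadUint32(&f.v) != 0 }

func main() {
	var f flag
	fmt.Println(f.Enabled()) // false
	f.Enable()
	fmt.Println(f.Enabled()) // true
}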
diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go
index 16d8207e9..b8df72813 100644
--- a/pkg/sentry/mm/vma.go
+++ b/pkg/sentry/mm/vma.go
@@ -27,8 +27,9 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
-// Preconditions: mm.mappingMu must be locked for writing. opts must be valid
-// as defined by the checks in MMap.
+// Preconditions:
+// * mm.mappingMu must be locked for writing.
+// * opts must be valid as defined by the checks in MMap.
func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOpts) (vmaIterator, usermem.AddrRange, error) {
if opts.MaxPerms != opts.MaxPerms.Effective() {
panic(fmt.Sprintf("Non-effective MaxPerms %s cannot be enforced", opts.MaxPerms))
@@ -42,7 +43,12 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp
Map32Bit: opts.Map32Bit,
})
if err != nil {
- return vmaIterator{}, usermem.AddrRange{}, err
+ // Can't force without opts.Unmap and opts.Fixed.
+ if opts.Force && opts.Unmap && opts.Fixed {
+ addr = opts.Addr
+ } else {
+ return vmaIterator{}, usermem.AddrRange{}, err
+ }
}
ar, _ := addr.ToRange(opts.Length)
@@ -255,11 +261,12 @@ func (mm *MemoryManager) mlockedBytesRangeLocked(ar usermem.AddrRange) uint64 {
//
// - An error that is non-nil if vmas exist for only a subset of ar.
//
-// Preconditions: mm.mappingMu must be locked for reading; it may be
-// temporarily unlocked. ar.Length() != 0.
+// Preconditions:
+// * mm.mappingMu must be locked for reading; it may be temporarily unlocked.
+// * ar.Length() != 0.
func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool) (vmaIterator, vmaGapIterator, error) {
if checkInvariants {
- if !ar.WellFormed() || ar.Length() <= 0 {
+ if !ar.WellFormed() || ar.Length() == 0 {
panic(fmt.Sprintf("invalid ar: %v", ar))
}
}
@@ -337,11 +344,13 @@ const guardBytes = 256 * usermem.PageSize
// unmapLocked unmaps all addresses in ar and returns the resulting gap in
// mm.vmas.
//
-// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0.
-// ar must be page-aligned.
+// Preconditions:
+// * mm.mappingMu must be locked for writing.
+// * ar.Length() != 0.
+// * ar must be page-aligned.
func (mm *MemoryManager) unmapLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator {
if checkInvariants {
- if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+ if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
panic(fmt.Sprintf("invalid ar: %v", ar))
}
}
@@ -356,11 +365,13 @@ func (mm *MemoryManager) unmapLocked(ctx context.Context, ar usermem.AddrRange)
// gap in mm.vmas. It does not remove pmas or AddressSpace mappings; clients
// must do so before calling removeVMAsLocked.
//
-// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0. ar
-// must be page-aligned.
+// Preconditions:
+// * mm.mappingMu must be locked for writing.
+// * ar.Length() != 0.
+// * ar must be page-aligned.
func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator {
if checkInvariants {
- if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+ if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
panic(fmt.Sprintf("invalid ar: %v", ar))
}
}
@@ -377,7 +388,7 @@ func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRa
vma.mappable.RemoveMapping(ctx, mm, vmaAR, vma.off, vma.canWriteMappableLocked())
}
if vma.id != nil {
- vma.id.DecRef()
+ vma.id.DecRef(ctx)
}
mm.usageAS -= uint64(vmaAR.Length())
if vma.isPrivateDataLocked() {
@@ -446,7 +457,7 @@ func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRa
}
if vma2.id != nil {
- vma2.id.DecRef()
+ vma2.id.DecRef(context.Background())
}
return vma1, true
}
@@ -462,7 +473,9 @@ func (vmaSetFunctions) Split(ar usermem.AddrRange, v vma, split usermem.Addr) (v
return v, v2
}
-// Preconditions: vseg.ValuePtr().mappable != nil. vseg.Range().Contains(addr).
+// Preconditions:
+// * vseg.ValuePtr().mappable != nil.
+// * vseg.Range().Contains(addr).
func (vseg vmaIterator) mappableOffsetAt(addr usermem.Addr) uint64 {
if checkInvariants {
if !vseg.Ok() {
@@ -486,8 +499,10 @@ func (vseg vmaIterator) mappableRange() memmap.MappableRange {
return vseg.mappableRangeOf(vseg.Range())
}
-// Preconditions: vseg.ValuePtr().mappable != nil.
-// vseg.Range().IsSupersetOf(ar). ar.Length() != 0.
+// Preconditions:
+// * vseg.ValuePtr().mappable != nil.
+// * vseg.Range().IsSupersetOf(ar).
+// * ar.Length() != 0.
func (vseg vmaIterator) mappableRangeOf(ar usermem.AddrRange) memmap.MappableRange {
if checkInvariants {
if !vseg.Ok() {
@@ -496,7 +511,7 @@ func (vseg vmaIterator) mappableRangeOf(ar usermem.AddrRange) memmap.MappableRan
if vseg.ValuePtr().mappable == nil {
panic("MappableRange is meaningless for anonymous vma")
}
- if !ar.WellFormed() || ar.Length() <= 0 {
+ if !ar.WellFormed() || ar.Length() == 0 {
panic(fmt.Sprintf("invalid ar: %v", ar))
}
if !vseg.Range().IsSupersetOf(ar) {
@@ -509,8 +524,10 @@ func (vseg vmaIterator) mappableRangeOf(ar usermem.AddrRange) memmap.MappableRan
return memmap.MappableRange{vma.off + uint64(ar.Start-vstart), vma.off + uint64(ar.End-vstart)}
}
-// Preconditions: vseg.ValuePtr().mappable != nil.
-// vseg.mappableRange().IsSupersetOf(mr). mr.Length() != 0.
+// Preconditions:
+// * vseg.ValuePtr().mappable != nil.
+// * vseg.mappableRange().IsSupersetOf(mr).
+// * mr.Length() != 0.
func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) usermem.AddrRange {
if checkInvariants {
if !vseg.Ok() {
@@ -519,7 +536,7 @@ func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) usermem.AddrRange {
if vseg.ValuePtr().mappable == nil {
panic("MappableRange is meaningless for anonymous vma")
}
- if !mr.WellFormed() || mr.Length() <= 0 {
+ if !mr.WellFormed() || mr.Length() == 0 {
panic(fmt.Sprintf("invalid mr: %v", mr))
}
if !vseg.mappableRange().IsSupersetOf(mr) {
@@ -535,7 +552,9 @@ func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) usermem.AddrRange {
// seekNextLowerBound returns mm.vmas.LowerBoundSegment(addr), but does so by
// scanning linearly forward from vseg.
//
-// Preconditions: mm.mappingMu must be locked. addr >= vseg.Start().
+// Preconditions:
+// * mm.mappingMu must be locked.
+// * addr >= vseg.Start().
func (vseg vmaIterator) seekNextLowerBound(addr usermem.Addr) vmaIterator {
if checkInvariants {
if !vseg.Ok() {
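Throughout mm, precondition checks like the ones above sit behind the checkInvariants constant, so release builds elide them entirely while debug builds panic early on a violated contract. A minimal sketch of the idiom, with toy types:

package main

import "fmt"

// checkInvariants would normally be set per build configuration; when
// false, the compiler eliminates the guarded blocks entirely.
const checkInvariants = true

type addrRange struct{ Start, End uint64 }

func (ar addrRange) WellFormed() bool { return ar.Start <= ar.End }
func (ar addrRange) Length() uint64   { return ar.End - ar.Start }

func unmap(ar addrRange) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}
	// ... real work elided ...
}

func main() {
	unmap(addrRange{0x1000, 0x2000}) // ok; {0x2000, 0x1000} would panic
}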
diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD
index 1eeb9f317..5b09b9feb 100644
--- a/pkg/sentry/pgalloc/BUILD
+++ b/pkg/sentry/pgalloc/BUILD
@@ -33,21 +33,42 @@ go_template_instance(
out = "usage_set.go",
consts = {
"minDegree": "10",
+ "trackGaps": "1",
},
imports = {
- "platform": "gvisor.dev/gvisor/pkg/sentry/platform",
+ "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap",
},
package = "pgalloc",
prefix = "usage",
template = "//pkg/segment:generic_set",
types = {
"Key": "uint64",
- "Range": "platform.FileRange",
+ "Range": "memmap.FileRange",
"Value": "usageInfo",
"Functions": "usageSetFunctions",
},
)
+go_template_instance(
+ name = "reclaim_set",
+ out = "reclaim_set.go",
+ consts = {
+ "minDegree": "10",
+ },
+ imports = {
+ "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap",
+ },
+ package = "pgalloc",
+ prefix = "reclaim",
+ template = "//pkg/segment:generic_set",
+ types = {
+ "Key": "uint64",
+ "Range": "memmap.FileRange",
+ "Value": "reclaimSetValue",
+ "Functions": "reclaimSetFunctions",
+ },
+)
+
go_library(
name = "pgalloc",
srcs = [
@@ -56,20 +77,23 @@ go_library(
"evictable_range_set.go",
"pgalloc.go",
"pgalloc_unsafe.go",
+ "reclaim_set.go",
"save_restore.go",
"usage_set.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
+ "//pkg/abi/linux",
"//pkg/context",
"//pkg/log",
"//pkg/memutil",
"//pkg/safemem",
"//pkg/sentry/arch",
"//pkg/sentry/hostmm",
- "//pkg/sentry/platform",
+ "//pkg/sentry/memmap",
"//pkg/sentry/usage",
"//pkg/state",
+ "//pkg/state/wire",
"//pkg/sync",
"//pkg/syserror",
"//pkg/usermem",
diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go
index 2b11ea4ae..7c297fb9e 100644
--- a/pkg/sentry/pgalloc/pgalloc.go
+++ b/pkg/sentry/pgalloc/pgalloc.go
@@ -29,18 +29,19 @@ import (
"syscall"
"time"
+ "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/hostmm"
- "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
-// MemoryFile is a platform.File whose pages may be allocated to arbitrary
+// MemoryFile is a memmap.File whose pages may be allocated to arbitrary
// users.
type MemoryFile struct {
// opts holds options passed to NewMemoryFile. opts is immutable.
@@ -108,12 +109,6 @@ type MemoryFile struct {
usageSwapped uint64
usageLast time.Time
- // minUnallocatedPage is the minimum page that may be unallocated.
- // i.e., there are no unallocated pages below minUnallocatedPage.
- //
- // minUnallocatedPage is protected by mu.
- minUnallocatedPage uint64
-
// fileSize is the size of the backing memory file in bytes. fileSize is
// always a power-of-two multiple of chunkSize.
//
@@ -146,11 +141,9 @@ type MemoryFile struct {
// is protected by mu.
reclaimable bool
- // minReclaimablePage is the minimum page that may be reclaimable.
- // i.e., all reclaimable pages are >= minReclaimablePage.
- //
- // minReclaimablePage is protected by mu.
- minReclaimablePage uint64
+ // reclaim is the collection of regions for reclaim. reclaim is protected
+ // by mu.
+ reclaim reclaimSet
// reclaimCond is signaled (with mu locked) when reclaimable or destroyed
// transitions from false to true.
@@ -232,6 +225,18 @@ type usageInfo struct {
refs uint64
}
+// canCommit returns true if the tracked region can be committed.
+func (u *usageInfo) canCommit() bool {
+ // refs must be greater than 0 because we assume that reclaimable pages
+ // (that aren't already known to be committed) are not committed. This
+ // isn't necessarily true, even after the reclaimer does Decommit(),
+ // because the kernel may subsequently back the hugepage-sized region
+ // containing the decommitted page with a hugepage. However, it's
+ // consistent with our treatment of unallocated pages, which have the same
+ // property.
+ return !u.knownCommitted && u.refs != 0
+}
+
// An EvictableMemoryUser represents a user of MemoryFile-allocated memory that
// may be asked to deallocate that memory in the presence of memory pressure.
type EvictableMemoryUser interface {
@@ -273,12 +278,10 @@ type evictableMemoryUserInfo struct {
}
const (
- chunkShift = 24
- chunkSize = 1 << chunkShift // 16 MB
+ chunkShift = 30
+ chunkSize = 1 << chunkShift // 1 GB
chunkMask = chunkSize - 1
- initialSize = chunkSize
-
// maxPage is the highest 64-bit page.
maxPage = math.MaxUint64 &^ (usermem.PageSize - 1)
)
@@ -302,19 +305,12 @@ func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) {
if err := file.Truncate(0); err != nil {
return nil, err
}
- if err := file.Truncate(initialSize); err != nil {
- return nil, err
- }
f := &MemoryFile{
- opts: opts,
- fileSize: initialSize,
- file: file,
- // No pages are reclaimable. DecRef will always be able to
- // decrease minReclaimablePage from this point.
- minReclaimablePage: maxPage,
- evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo),
+ opts: opts,
+ file: file,
+ evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo),
}
- f.mappings.Store(make([]uintptr, initialSize/chunkSize))
+ f.mappings.Store(make([]uintptr, 0))
f.reclaimCond.L = &f.mu
if f.opts.DelayedEviction == DelayedEvictionEnabled && f.opts.UseHostMemcgPressure {
@@ -389,7 +385,7 @@ func (f *MemoryFile) Destroy() {
// to Allocate.
//
// Preconditions: length must be page-aligned and non-zero.
-func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.FileRange, error) {
+func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (memmap.FileRange, error) {
if length == 0 || length%usermem.PageSize != 0 {
panic(fmt.Sprintf("invalid allocation length: %#x", length))
}
@@ -404,48 +400,38 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi
alignment = usermem.HugePageSize
}
- start, minUnallocatedPage := findUnallocatedRange(&f.usage, f.minUnallocatedPage, length, alignment)
- end := start + length
- // File offsets are int64s. Since length must be strictly positive, end
- // cannot legitimately be 0.
- if end < start || int64(end) <= 0 {
- return platform.FileRange{}, syserror.ENOMEM
+ // Find a range in the underlying file.
+ fr, ok := findAvailableRange(&f.usage, f.fileSize, length, alignment)
+ if !ok {
+ return memmap.FileRange{}, syserror.ENOMEM
}
- // Expand the file if needed. Double the file size on each expansion;
- // uncommitted pages have effectively no cost.
- fileSize := f.fileSize
- for int64(end) > fileSize {
- if fileSize >= 2*fileSize {
- // fileSize overflow.
- return platform.FileRange{}, syserror.ENOMEM
+ // Expand the file if needed.
+ if int64(fr.End) > f.fileSize {
+ // Round the new file size up to be chunk-aligned.
+ newFileSize := (int64(fr.End) + chunkMask) &^ chunkMask
+ if err := f.file.Truncate(newFileSize); err != nil {
+ return memmap.FileRange{}, err
}
- fileSize *= 2
- }
- if fileSize > f.fileSize {
- if err := f.file.Truncate(fileSize); err != nil {
- return platform.FileRange{}, err
- }
- f.fileSize = fileSize
+ f.fileSize = newFileSize
f.mappingsMu.Lock()
oldMappings := f.mappings.Load().([]uintptr)
- newMappings := make([]uintptr, fileSize>>chunkShift)
+ newMappings := make([]uintptr, newFileSize>>chunkShift)
copy(newMappings, oldMappings)
f.mappings.Store(newMappings)
f.mappingsMu.Unlock()
}
- // Mark selected pages as in use.
- fr := platform.FileRange{start, end}
if f.opts.ManualZeroing {
if err := f.forEachMappingSlice(fr, func(bs []byte) {
for i := range bs {
bs[i] = 0
}
}); err != nil {
- return platform.FileRange{}, err
+ return memmap.FileRange{}, err
}
}
+ // Mark selected pages as in use.
if !f.usage.Add(fr, usageInfo{
kind: kind,
refs: 1,
@@ -453,49 +439,79 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi
panic(fmt.Sprintf("allocating %v: failed to insert into usage set:\n%v", fr, &f.usage))
}
- if minUnallocatedPage < start {
- f.minUnallocatedPage = minUnallocatedPage
- } else {
- // start was the first unallocated page. The next must be
- // somewhere beyond end.
- f.minUnallocatedPage = end
- }
-
return fr, nil
}
-// findUnallocatedRange returns the first unallocated page in usage of the
-// specified length and alignment beginning at page start and the first single
-// unallocated page.
-func findUnallocatedRange(usage *usageSet, start, length, alignment uint64) (uint64, uint64) {
- // Only searched until the first page is found.
- firstPage := start
- foundFirstPage := false
- alignMask := alignment - 1
- for seg := usage.LowerBoundSegment(start); seg.Ok(); seg = seg.NextSegment() {
- r := seg.Range()
-
- if !foundFirstPage && r.Start > firstPage {
- foundFirstPage = true
+// findAvailableRange returns an available range in the usageSet.
+//
+// Note that the search for available slots scans backward from the end of the
+// file first, then forward. This heuristic has important consequences for how
+// sequential mappings can be merged in the host VMAs, given that addresses
+// for both application and sentry mappings are allocated top-down (from
+// higher to lower addresses). The file is also grown exponentially in order
+// to create space for mappings to be allocated downward.
+//
+// Precondition: alignment must be a power of 2.
+func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint64) (memmap.FileRange, bool) {
+ alignmentMask := alignment - 1
+
+ // Search for space in existing gaps, starting at the current end of the
+ // file and working backward.
+ lastGap := usage.LastGap()
+ gap := lastGap
+ for {
+ end := gap.End()
+ if end > uint64(fileSize) {
+ end = uint64(fileSize)
}
- if start >= r.End {
- // start was rounded up to an alignment boundary from the end
- // of a previous segment and is now beyond r.End.
- continue
+ // Try to allocate from the end of this gap, with the start of the
+ // allocated range aligned down to alignment.
+ unalignedStart := end - length
+ if unalignedStart > end {
+ // Negative overflow: this and all preceding gaps are too small to
+ // accommodate length.
+ break
}
- // This segment represents allocated or reclaimable pages; only the
- // range from start to the segment's beginning is allocatable, and the
- // next allocatable range begins after the segment.
- if r.Start > start && r.Start-start >= length {
+ if start := unalignedStart &^ alignmentMask; start >= gap.Start() {
+ return memmap.FileRange{start, start + length}, true
+ }
+
+ gap = gap.PrevLargeEnoughGap(length)
+ if !gap.Ok() {
break
}
- start = (r.End + alignMask) &^ alignMask
- if !foundFirstPage {
- firstPage = r.End
+ }
+
+ // Check that it's possible to fit this allocation at the end of a file of any size.
+ min := lastGap.Start()
+ min = (min + alignmentMask) &^ alignmentMask
+ if min+length < min {
+ // Overflow: allocation would exceed the range of uint64.
+ return memmap.FileRange{}, false
+ }
+
+ // Determine the minimum file size required to fit this allocation at its end.
+ for {
+ newFileSize := 2 * fileSize
+ if newFileSize <= fileSize {
+ if fileSize != 0 {
+ // Overflow: allocation would exceed the range of int64.
+ return memmap.FileRange{}, false
+ }
+ newFileSize = chunkSize
+ }
+ fileSize = newFileSize
+
+ unalignedStart := uint64(fileSize) - length
+ if unalignedStart > uint64(fileSize) {
+ // Negative overflow: fileSize is still inadequate.
+ continue
+ }
+ if start := unalignedStart &^ alignmentMask; start >= min {
+ return memmap.FileRange{start, start + length}, true
}
}
- return start, firstPage
}
// AllocateAndFill allocates memory of the given kind and fills it by calling
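The backward-then-forward search in findAvailableRange is easier to follow on a simplified model: allocated ranges kept in a sorted slice, gaps visited from the end of the file toward the start, and each candidate start aligned down. A standalone sketch under those assumptions (not the segment-set implementation above, and without the file-growth fallback):

package main

import "fmt"

type fileRange struct{ Start, End uint64 }

// findTopDown scans the gaps between allocated ranges from the end of
// the file backward, returning the highest aligned start that fits.
// Assumes allocated is sorted, disjoint, and within [0, fileSize).
func findTopDown(allocated []fileRange, fileSize, length, alignment uint64) (uint64, bool) {
	mask := alignment - 1
	gapEnd := fileSize
	for i := len(allocated); i >= 0; i-- {
		gapStart := uint64(0)
		if i > 0 {
			gapStart = allocated[i-1].End
		}
		// Allocate from the top of the gap, aligning the start down.
		if gapEnd >= length && gapEnd-length >= gapStart {
			if start := (gapEnd - length) &^ mask; start >= gapStart {
				return start, true
			}
		}
		if i > 0 {
			gapEnd = allocated[i-1].Start
		}
	}
	return 0, false
}

func main() {
	allocated := []fileRange{{0x1000, 0x2000}, {0x4000, 0x5000}}
	start, ok := findTopDown(allocated, 0x8000, 0x1000, 0x1000)
	fmt.Printf("%#x %v\n", start, ok) // 0x7000 true: the topmost gap wins
}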
@@ -504,23 +520,25 @@ func findUnallocatedRange(usage *usageSet, start, length, alignment uint64) (uin
// nearest page. If this is shorter than length bytes due to an error returned
// by r.ReadToBlocks(), it returns that error.
//
-// Preconditions: length > 0. length must be page-aligned.
-func (f *MemoryFile) AllocateAndFill(length uint64, kind usage.MemoryKind, r safemem.Reader) (platform.FileRange, error) {
+// Preconditions:
+// * length > 0.
+// * length must be page-aligned.
+func (f *MemoryFile) AllocateAndFill(length uint64, kind usage.MemoryKind, r safemem.Reader) (memmap.FileRange, error) {
fr, err := f.Allocate(length, kind)
if err != nil {
- return platform.FileRange{}, err
+ return memmap.FileRange{}, err
}
dsts, err := f.MapInternal(fr, usermem.Write)
if err != nil {
f.DecRef(fr)
- return platform.FileRange{}, err
+ return memmap.FileRange{}, err
}
n, err := safemem.ReadFullToBlocks(r, dsts)
un := uint64(usermem.Addr(n).RoundDown())
if un < length {
// Free unused memory and update fr to contain only the memory that is
// still allocated.
- f.DecRef(platform.FileRange{fr.Start + un, fr.End})
+ f.DecRef(memmap.FileRange{fr.Start + un, fr.End})
fr.End = fr.Start + un
}
return fr, err
@@ -537,7 +555,7 @@ const (
// will read zeroes.
//
// Preconditions: fr.Length() > 0.
-func (f *MemoryFile) Decommit(fr platform.FileRange) error {
+func (f *MemoryFile) Decommit(fr memmap.FileRange) error {
if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 {
panic(fmt.Sprintf("invalid range: %v", fr))
}
@@ -557,7 +575,7 @@ func (f *MemoryFile) Decommit(fr platform.FileRange) error {
return nil
}
-func (f *MemoryFile) markDecommitted(fr platform.FileRange) {
+func (f *MemoryFile) markDecommitted(fr memmap.FileRange) {
f.mu.Lock()
defer f.mu.Unlock()
// Since we're changing the knownCommitted attribute, we need to merge
@@ -578,8 +596,8 @@ func (f *MemoryFile) markDecommitted(fr platform.FileRange) {
f.usage.MergeRange(fr)
}
-// IncRef implements platform.File.IncRef.
-func (f *MemoryFile) IncRef(fr platform.FileRange) {
+// IncRef implements memmap.File.IncRef.
+func (f *MemoryFile) IncRef(fr memmap.FileRange) {
if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 {
panic(fmt.Sprintf("invalid range: %v", fr))
}
@@ -597,8 +615,8 @@ func (f *MemoryFile) IncRef(fr platform.FileRange) {
f.usage.MergeAdjacent(fr)
}
-// DecRef implements platform.File.DecRef.
-func (f *MemoryFile) DecRef(fr platform.FileRange) {
+// DecRef implements memmap.File.DecRef.
+func (f *MemoryFile) DecRef(fr memmap.FileRange) {
if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 {
panic(fmt.Sprintf("invalid range: %v", fr))
}
@@ -616,6 +634,7 @@ func (f *MemoryFile) DecRef(fr platform.FileRange) {
}
val.refs--
if val.refs == 0 {
+ f.reclaim.Add(seg.Range(), reclaimSetValue{})
freed = true
// Reclassify memory as System, until it's freed by the reclaim
// goroutine.
@@ -628,17 +647,13 @@ func (f *MemoryFile) DecRef(fr platform.FileRange) {
f.usage.MergeAdjacent(fr)
if freed {
- if fr.Start < f.minReclaimablePage {
- // We've freed at least one lower page.
- f.minReclaimablePage = fr.Start
- }
f.reclaimable = true
f.reclaimCond.Signal()
}
}
-// MapInternal implements platform.File.MapInternal.
-func (f *MemoryFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+// MapInternal implements memmap.File.MapInternal.
+func (f *MemoryFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
if !fr.WellFormed() || fr.Length() == 0 {
panic(fmt.Sprintf("invalid range: %v", fr))
}
@@ -664,7 +679,7 @@ func (f *MemoryFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (
// forEachMappingSlice invokes fn on a sequence of byte slices that
// collectively map all bytes in fr.
-func (f *MemoryFile) forEachMappingSlice(fr platform.FileRange, fn func([]byte)) error {
+func (f *MemoryFile) forEachMappingSlice(fr memmap.FileRange, fn func([]byte)) error {
mappings := f.mappings.Load().([]uintptr)
for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize {
chunk := int(chunkStart >> chunkShift)
@@ -826,6 +841,11 @@ func (f *MemoryFile) UpdateUsage() error {
log.Debugf("UpdateUsage: skipped with usageSwapped!=0.")
return nil
}
+ // Linux updates usage values at CONFIG_HZ.
+ if scanningAfter := time.Now().Sub(f.usageLast).Milliseconds(); scanningAfter < time.Second.Milliseconds()/linux.CLOCKS_PER_SEC {
+ log.Debugf("UpdateUsage: skipped because previous scan happened %d ms ago", scanningAfter)
+ return nil
+ }
f.usageLast = time.Now()
err = f.updateUsageLocked(currentUsage, mincore)
@@ -839,7 +859,7 @@ func (f *MemoryFile) UpdateUsage() error {
// pages by invoking checkCommitted, which is a function that, for each page i
// in bs, sets committed[i] to 1 if the page is committed and 0 otherwise.
//
-// Precondition: f.mu must be held.
+// Precondition: f.mu must be held; it may be unlocked and reacquired.
func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(bs []byte, committed []byte) error) error {
// Track if anything changed to elide the merge. In the common case, we
// expect all segments to be committed and no merge to occur.
@@ -866,7 +886,7 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(
} else if f.usageSwapped != 0 {
// We have more usage accounted for than the file itself.
// That's fine, we probably caught a race where pages were
- // being committed while the above loop was running. Just
+ // being committed while the below loop was running. Just
// report the higher number that we found and ignore swap.
usage.MemoryAccounting.Dec(f.usageSwapped, usage.System)
f.usageSwapped = 0
@@ -878,21 +898,9 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(
// Iterate over all usage data. There will only be usage segments
// present when there is an associated reference.
- for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
- val := seg.Value()
-
- // Already known to be committed; ignore.
- if val.knownCommitted {
- continue
- }
-
- // Assume that reclaimable pages (that aren't already known to be
- // committed) are not committed. This isn't necessarily true, even
- // after the reclaimer does Decommit(), because the kernel may
- // subsequently back the hugepage-sized region containing the
- // decommitted page with a hugepage. However, it's consistent with our
- // treatment of unallocated pages, which have the same property.
- if val.refs == 0 {
+ for seg := f.usage.FirstSegment(); seg.Ok(); {
+ if !seg.ValuePtr().canCommit() {
+ seg = seg.NextSegment()
continue
}
@@ -915,56 +923,53 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(
}
// Query for new pages in core.
- if err := checkCommitted(s, buf); err != nil {
+ // NOTE(b/165896008): mincore (which is passed as checkCommitted)
+ // by f.UpdateUsage() might take a really long time. So unlock f.mu
+ // while checkCommitted runs.
+ f.mu.Unlock()
+ err := checkCommitted(s, buf)
+ f.mu.Lock()
+ if err != nil {
checkErr = err
return
}
// Scan each page and switch out segments.
- populatedRun := false
- populatedRunStart := 0
- for i := 0; i <= bufLen; i++ {
- // We run past the end of the slice here to
- // simplify the logic and only set populated if
- // we're still looking at elements.
- populated := false
- if i < bufLen {
- populated = buf[i]&0x1 != 0
- }
-
- switch {
- case populated == populatedRun:
- // Keep the run going.
- continue
- case populated && !populatedRun:
- // Begin the run.
- populatedRun = true
- populatedRunStart = i
- // Keep going.
+ seg := f.usage.LowerBoundSegment(r.Start)
+ for i := 0; i < bufLen; {
+ if buf[i]&0x1 == 0 {
+ i++
continue
- case !populated && populatedRun:
- // Finish the run by changing this segment.
- runRange := platform.FileRange{
- Start: r.Start + uint64(populatedRunStart*usermem.PageSize),
- End: r.Start + uint64(i*usermem.PageSize),
+ }
+ // Scan to the end of this committed range.
+ j := i + 1
+ for ; j < bufLen; j++ {
+ if buf[j]&0x1 == 0 {
+ break
}
- seg = f.usage.Isolate(seg, runRange)
- seg.ValuePtr().knownCommitted = true
- // Advance the segment only if we still
- // have work to do in the context of
- // the original segment from the for
- // loop. Otherwise, the for loop itself
- // will advance the segment
- // appropriately.
- if runRange.End != r.End {
- seg = seg.NextSegment()
+ }
+ committedFR := memmap.FileRange{
+ Start: r.Start + uint64(i*usermem.PageSize),
+ End: r.Start + uint64(j*usermem.PageSize),
+ }
+ // Advance seg to committedFR.Start.
+ for seg.Ok() && seg.End() < committedFR.Start {
+ seg = seg.NextSegment()
+ }
+ // Mark pages overlapping committedFR as committed.
+ for seg.Ok() && seg.Start() < committedFR.End {
+ if seg.ValuePtr().canCommit() {
+ seg = f.usage.Isolate(seg, committedFR)
+ seg.ValuePtr().knownCommitted = true
+ amount := seg.Range().Length()
+ usage.MemoryAccounting.Inc(amount, seg.ValuePtr().kind)
+ f.usageExpected += amount
+ changedAny = true
}
- amount := runRange.Length()
- usage.MemoryAccounting.Inc(amount, val.kind)
- f.usageExpected += amount
- changedAny = true
- populatedRun = false
+ seg = seg.NextSegment()
}
+ // Continue scanning for committed pages.
+ i = j + 1
}
// Advance r.Start.
@@ -976,6 +981,9 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(
if err != nil {
return err
}
+
+ // Continue with the first segment after r.End.
+ seg = f.usage.LowerBoundSegment(r.End)
}
return nil
@@ -1009,7 +1017,7 @@ func (f *MemoryFile) File() *os.File {
return f.file
}
-// FD implements platform.File.FD.
+// FD implements memmap.File.FD.
func (f *MemoryFile) FD() int {
return int(f.file.Fd())
}
@@ -1030,6 +1038,7 @@ func (f *MemoryFile) String() string {
// for allocation.
func (f *MemoryFile) runReclaim() {
for {
+ // N.B. We must call f.markReclaimed on the returned FileRange.
fr, ok := f.findReclaimable()
if !ok {
break
@@ -1085,13 +1094,17 @@ func (f *MemoryFile) runReclaim() {
}
}
-func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) {
+// findReclaimable finds memory that has been marked for reclaim.
+//
+// Note that the returned range will be removed from tracking. It
+// must then be reclaimed (removed from f.usage) by the caller.
+func (f *MemoryFile) findReclaimable() (memmap.FileRange, bool) {
f.mu.Lock()
defer f.mu.Unlock()
for {
for {
if f.destroyed {
- return platform.FileRange{}, false
+ return memmap.FileRange{}, false
}
if f.reclaimable {
break
@@ -1103,27 +1116,24 @@ func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) {
}
f.reclaimCond.Wait()
}
- // Allocate returns the first usable range in offset order and is
- // currently a linear scan, so reclaiming from the beginning of the
- // file minimizes the expected latency of Allocate.
- for seg := f.usage.LowerBoundSegment(f.minReclaimablePage); seg.Ok(); seg = seg.NextSegment() {
- if seg.ValuePtr().refs == 0 {
- f.minReclaimablePage = seg.End()
- return seg.Range(), true
- }
+ // Allocate works from the back of the file inwards, so reclaim
+ // preserves this order to minimize the cost of the search.
+ if seg := f.reclaim.LastSegment(); seg.Ok() {
+ fr := seg.Range()
+ f.reclaim.Remove(seg)
+ return fr, true
}
- // No pages are reclaimable.
+ // Nothing is reclaimable.
f.reclaimable = false
- f.minReclaimablePage = maxPage
}
}
-func (f *MemoryFile) markReclaimed(fr platform.FileRange) {
+func (f *MemoryFile) markReclaimed(fr memmap.FileRange) {
f.mu.Lock()
defer f.mu.Unlock()
seg := f.usage.FindSegment(fr.Start)
- // All of fr should be mapped to a single uncommitted reclaimable segment
- // accounted to System.
+ // All of fr should be mapped to a single uncommitted reclaimable
+ // segment accounted to System.
if !seg.Ok() {
panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage))
}
@@ -1137,14 +1147,10 @@ func (f *MemoryFile) markReclaimed(fr platform.FileRange) {
}); got != want {
panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage))
}
- // Deallocate reclaimed pages. Even though all of seg is reclaimable, the
- // caller of markReclaimed may not have decommitted it, so we can only mark
- // fr as reclaimed.
+ // Deallocate reclaimed pages. Even though all of seg is reclaimable,
+ // the caller of markReclaimed may not have decommitted it, so we can
+ // only mark fr as reclaimed.
f.usage.Remove(f.usage.Isolate(seg, fr))
- if fr.Start < f.minUnallocatedPage {
- // We've deallocated at least one lower page.
- f.minUnallocatedPage = fr.Start
- }
}
// StartEvictions requests that f evict all evictable allocations. It does not
@@ -1169,8 +1175,10 @@ func (f *MemoryFile) startEvictionsLocked() bool {
return startedAny
}
-// Preconditions: info == f.evictable[user]. !info.evicting. f.mu must be
-// locked.
+// Preconditions:
+// * info == f.evictable[user].
+// * !info.evicting.
+// * f.mu must be locked.
func (f *MemoryFile) startEvictionGoroutineLocked(user EvictableMemoryUser, info *evictableMemoryUserInfo) {
info.evicting = true
f.evictionWG.Add(1)
@@ -1224,11 +1232,11 @@ func (usageSetFunctions) MaxKey() uint64 {
func (usageSetFunctions) ClearValue(val *usageInfo) {
}
-func (usageSetFunctions) Merge(_ platform.FileRange, val1 usageInfo, _ platform.FileRange, val2 usageInfo) (usageInfo, bool) {
+func (usageSetFunctions) Merge(_ memmap.FileRange, val1 usageInfo, _ memmap.FileRange, val2 usageInfo) (usageInfo, bool) {
return val1, val1 == val2
}
-func (usageSetFunctions) Split(_ platform.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) {
+func (usageSetFunctions) Split(_ memmap.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) {
return val, val
}
@@ -1255,3 +1263,27 @@ func (evictableRangeSetFunctions) Merge(_ EvictableRange, _ evictableRangeSetVal
func (evictableRangeSetFunctions) Split(_ EvictableRange, _ evictableRangeSetValue, _ uint64) (evictableRangeSetValue, evictableRangeSetValue) {
return evictableRangeSetValue{}, evictableRangeSetValue{}
}
+
+// reclaimSetValue is the value type of reclaimSet.
+type reclaimSetValue struct{}
+
+type reclaimSetFunctions struct{}
+
+func (reclaimSetFunctions) MinKey() uint64 {
+ return 0
+}
+
+func (reclaimSetFunctions) MaxKey() uint64 {
+ return math.MaxUint64
+}
+
+func (reclaimSetFunctions) ClearValue(val *reclaimSetValue) {
+}
+
+func (reclaimSetFunctions) Merge(_ memmap.FileRange, _ reclaimSetValue, _ memmap.FileRange, _ reclaimSetValue) (reclaimSetValue, bool) {
+ return reclaimSetValue{}, true
+}
+
+func (reclaimSetFunctions) Split(_ memmap.FileRange, _ reclaimSetValue, _ uint64) (reclaimSetValue, reclaimSetValue) {
+ return reclaimSetValue{}, reclaimSetValue{}
+}
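The NOTE(b/165896008) change above drops f.mu around the mincore scan. The underlying pattern is: unlock, run the slow call, relock, and re-validate any state the lock protects, since it may have changed in the window. A hedged sketch of that pattern with toy types (not the MemoryFile code):

package main

import (
	"fmt"
	"sync"
)

type file struct {
	mu    sync.Mutex
	usage map[uint64]bool // protected by mu
}

// scanLocked is called with f.mu held. It releases the lock around the
// expensive check so other lockers can make progress, then reacquires
// it; state guarded by mu must be re-read afterward.
func (f *file) scanLocked(page uint64, slowCheck func(uint64) bool) {
	f.mu.Unlock()
	committed := slowCheck(page) // e.g. a mincore(2)-style probe
	f.mu.Lock()
	// Re-validate under the lock: the entry may have been removed while
	// we were unlocked.
	if _, ok := f.usage[page]; ok && committed {
		f.usage[page] = true
	}
}

func main() {
	f := &file{usage: map[uint64]bool{0x1000: false}}
	f.mu.Lock()
	f.scanLocked(0x1000, func(uint64) bool { return true })
	f.mu.Unlock()
	fmt.Println(f.usage[0x1000]) // true
}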
diff --git a/pkg/sentry/pgalloc/pgalloc_test.go b/pkg/sentry/pgalloc/pgalloc_test.go
index 293f22c6b..405db141f 100644
--- a/pkg/sentry/pgalloc/pgalloc_test.go
+++ b/pkg/sentry/pgalloc/pgalloc_test.go
@@ -23,39 +23,49 @@ import (
const (
page = usermem.PageSize
hugepage = usermem.HugePageSize
+ topPage = (1 << 63) - page
)
func TestFindUnallocatedRange(t *testing.T) {
for _, test := range []struct {
- desc string
- usage *usageSegmentDataSlices
- start uint64
- length uint64
- alignment uint64
- unallocated uint64
- minUnallocated uint64
+ desc string
+ usage *usageSegmentDataSlices
+ fileSize int64
+ length uint64
+ alignment uint64
+ start uint64
+ expectFail bool
}{
{
- desc: "Initial allocation succeeds",
- usage: &usageSegmentDataSlices{},
- start: 0,
- length: page,
- alignment: page,
- unallocated: 0,
- minUnallocated: 0,
+ desc: "Initial allocation succeeds",
+ usage: &usageSegmentDataSlices{},
+ length: page,
+ alignment: page,
+ start: chunkSize - page, // Grows by chunkSize, allocate down.
},
{
- desc: "Allocation begins at start of file",
+ desc: "Allocation finds empty space at start of file",
usage: &usageSegmentDataSlices{
Start: []uint64{page},
End: []uint64{2 * page},
Values: []usageInfo{{refs: 1}},
},
- start: 0,
- length: page,
- alignment: page,
- unallocated: 0,
- minUnallocated: 0,
+ fileSize: 2 * page,
+ length: page,
+ alignment: page,
+ start: 0,
+ },
+ {
+ desc: "Allocation finds empty space at end of file",
+ usage: &usageSegmentDataSlices{
+ Start: []uint64{0},
+ End: []uint64{page},
+ Values: []usageInfo{{refs: 1}},
+ },
+ fileSize: 2 * page,
+ length: page,
+ alignment: page,
+ start: page,
},
{
desc: "In-use frames are not allocatable",
@@ -64,11 +74,10 @@ func TestFindUnallocatedRange(t *testing.T) {
End: []uint64{page, 2 * page},
Values: []usageInfo{{refs: 1}, {refs: 2}},
},
- start: 0,
- length: page,
- alignment: page,
- unallocated: 2 * page,
- minUnallocated: 2 * page,
+ fileSize: 2 * page,
+ length: page,
+ alignment: page,
+ start: 3 * page, // Double fileSize, allocate top-down.
},
{
desc: "Reclaimable frames are not allocatable",
@@ -77,11 +86,10 @@ func TestFindUnallocatedRange(t *testing.T) {
End: []uint64{page, 2 * page, 3 * page},
Values: []usageInfo{{refs: 1}, {refs: 0}, {refs: 1}},
},
- start: 0,
- length: page,
- alignment: page,
- unallocated: 3 * page,
- minUnallocated: 3 * page,
+ fileSize: 3 * page,
+ length: page,
+ alignment: page,
+ start: 5 * page, // Double fileSize, grow down.
},
{
desc: "Gaps between in-use frames are allocatable",
@@ -90,11 +98,10 @@ func TestFindUnallocatedRange(t *testing.T) {
End: []uint64{page, 3 * page},
Values: []usageInfo{{refs: 1}, {refs: 1}},
},
- start: 0,
- length: page,
- alignment: page,
- unallocated: page,
- minUnallocated: page,
+ fileSize: 3 * page,
+ length: page,
+ alignment: page,
+ start: page,
},
{
desc: "Inadequately-sized gaps are rejected",
@@ -103,14 +110,13 @@ func TestFindUnallocatedRange(t *testing.T) {
End: []uint64{page, 3 * page},
Values: []usageInfo{{refs: 1}, {refs: 1}},
},
- start: 0,
- length: 2 * page,
- alignment: page,
- unallocated: 3 * page,
- minUnallocated: page,
+ fileSize: 3 * page,
+ length: 2 * page,
+ alignment: page,
+ start: 4 * page, // Double fileSize, grow down.
},
{
- desc: "Hugepage alignment is honored",
+ desc: "Alignment is honored at end of file",
usage: &usageSegmentDataSlices{
Start: []uint64{0, hugepage + page},
// Hugepage-sized gap here that shouldn't be allocated from
@@ -118,37 +124,103 @@ func TestFindUnallocatedRange(t *testing.T) {
End: []uint64{page, hugepage + 2*page},
Values: []usageInfo{{refs: 1}, {refs: 1}},
},
- start: 0,
- length: hugepage,
- alignment: hugepage,
- unallocated: 2 * hugepage,
- minUnallocated: page,
+ fileSize: hugepage + 2*page,
+ length: hugepage,
+ alignment: hugepage,
+ start: 3 * hugepage, // Double fileSize until alignment is satisfied, grow down.
},
{
- desc: "Pages before start ignored",
+ desc: "Alignment is honored before end of file",
+ usage: &usageSegmentDataSlices{
+ Start: []uint64{0, 2*hugepage + page},
+ // Page will need to be shifted down from top.
+ End: []uint64{page, 2*hugepage + 2*page},
+ Values: []usageInfo{{refs: 1}, {refs: 1}},
+ },
+ fileSize: 2*hugepage + 2*page,
+ length: hugepage,
+ alignment: hugepage,
+ start: hugepage,
+ },
+ {
+ desc: "Allocation doubles file size more than once if necessary",
+ usage: &usageSegmentDataSlices{},
+ fileSize: page,
+ length: 4 * page,
+ alignment: page,
+ start: 0,
+ },
+ {
+ desc: "Allocations are compact if possible",
usage: &usageSegmentDataSlices{
Start: []uint64{page, 3 * page},
End: []uint64{2 * page, 4 * page},
Values: []usageInfo{{refs: 1}, {refs: 2}},
},
- start: page,
- length: page,
- alignment: page,
- unallocated: 2 * page,
- minUnallocated: 2 * page,
+ fileSize: 4 * page,
+ length: page,
+ alignment: page,
+ start: 2 * page,
+ },
+ {
+ desc: "Top-down allocation within one gap",
+ usage: &usageSegmentDataSlices{
+ Start: []uint64{page, 4 * page, 7 * page},
+ End: []uint64{2 * page, 5 * page, 8 * page},
+ Values: []usageInfo{{refs: 1}, {refs: 2}, {refs: 1}},
+ },
+ fileSize: 8 * page,
+ length: page,
+ alignment: page,
+ start: 6 * page,
+ },
+ {
+ desc: "Top-down allocation between multiple gaps",
+ usage: &usageSegmentDataSlices{
+ Start: []uint64{page, 3 * page, 5 * page},
+ End: []uint64{2 * page, 4 * page, 6 * page},
+ Values: []usageInfo{{refs: 1}, {refs: 2}, {refs: 1}},
+ },
+ fileSize: 6 * page,
+ length: page,
+ alignment: page,
+ start: 4 * page,
},
{
- desc: "start may be in the middle of segment",
+ desc: "Top-down allocation with large top gap",
usage: &usageSegmentDataSlices{
- Start: []uint64{0, 3 * page},
+ Start: []uint64{page, 3 * page},
End: []uint64{2 * page, 4 * page},
Values: []usageInfo{{refs: 1}, {refs: 2}},
},
- start: page,
- length: page,
- alignment: page,
- unallocated: 2 * page,
- minUnallocated: 2 * page,
+ fileSize: 8 * page,
+ length: page,
+ alignment: page,
+ start: 7 * page,
+ },
+ {
+ desc: "Gaps found with possible overflow",
+ usage: &usageSegmentDataSlices{
+ Start: []uint64{page, topPage - page},
+ End: []uint64{2 * page, topPage},
+ Values: []usageInfo{{refs: 1}, {refs: 1}},
+ },
+ fileSize: topPage,
+ length: page,
+ alignment: page,
+ start: topPage - 2*page,
+ },
+ {
+ desc: "Overflow detected",
+ usage: &usageSegmentDataSlices{
+ Start: []uint64{page},
+ End: []uint64{topPage},
+ Values: []usageInfo{{refs: 1}},
+ },
+ fileSize: topPage,
+ length: 2 * page,
+ alignment: page,
+ expectFail: true,
},
} {
t.Run(test.desc, func(t *testing.T) {
@@ -156,12 +228,18 @@ func TestFindUnallocatedRange(t *testing.T) {
if err := usage.ImportSortedSlices(test.usage); err != nil {
t.Fatalf("Failed to initialize usage from %v: %v", test.usage, err)
}
- unallocated, minUnallocated := findUnallocatedRange(&usage, test.start, test.length, test.alignment)
- if unallocated != test.unallocated {
- t.Errorf("findUnallocatedRange(%v, %x, %x, %x): got unallocated %x, wanted %x", test.usage, test.start, test.length, test.alignment, unallocated, test.unallocated)
+ fr, ok := findAvailableRange(&usage, test.fileSize, test.length, test.alignment)
+ if !test.expectFail && !ok {
+ t.Fatalf("findAvailableRange(%v, %x, %x, %x): got %x, false; wanted %x, true", test.usage, test.fileSize, test.length, test.alignment, fr.Start, test.start)
+ }
+ if test.expectFail && ok {
+ t.Fatalf("findAvailableRange(%v, %x, %x, %x): got %x, true; wanted %x, false", test.usage, test.fileSize, test.length, test.alignment, fr.Start, test.start)
+ }
+ if ok && fr.Start != test.start {
+ t.Errorf("findAvailableRange(%v, %x, %x, %x): got start=%x, wanted %x", test.usage, test.fileSize, test.length, test.alignment, fr.Start, test.start)
}
- if minUnallocated != test.minUnallocated {
- t.Errorf("findUnallocatedRange(%v, %x, %x, %x): got minUnallocated %x, wanted %x", test.usage, test.start, test.length, test.alignment, minUnallocated, test.minUnallocated)
+ if ok && fr.End != test.start+test.length {
+ t.Errorf("findAvailableRange(%v, %x, %x, %x): got end=%x, wanted %x", test.usage, test.fileSize, test.length, test.alignment, fr.End, test.start+test.length)
}
})
}
diff --git a/pkg/sentry/pgalloc/save_restore.go b/pkg/sentry/pgalloc/save_restore.go
index f8385c146..78317fa35 100644
--- a/pkg/sentry/pgalloc/save_restore.go
+++ b/pkg/sentry/pgalloc/save_restore.go
@@ -26,11 +26,12 @@ import (
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/state"
+ "gvisor.dev/gvisor/pkg/state/wire"
"gvisor.dev/gvisor/pkg/usermem"
)
// SaveTo writes f's state to the given stream.
-func (f *MemoryFile) SaveTo(ctx context.Context, w io.Writer) error {
+func (f *MemoryFile) SaveTo(ctx context.Context, w wire.Writer) error {
// Wait for reclaim.
f.mu.Lock()
defer f.mu.Unlock()
@@ -79,10 +80,10 @@ func (f *MemoryFile) SaveTo(ctx context.Context, w io.Writer) error {
}
// Save metadata.
- if err := state.Save(ctx, w, &f.fileSize, nil); err != nil {
+ if _, err := state.Save(ctx, w, &f.fileSize); err != nil {
return err
}
- if err := state.Save(ctx, w, &f.usage, nil); err != nil {
+ if _, err := state.Save(ctx, w, &f.usage); err != nil {
return err
}
@@ -115,9 +116,9 @@ func (f *MemoryFile) SaveTo(ctx context.Context, w io.Writer) error {
}
// LoadFrom loads MemoryFile state from the given stream.
-func (f *MemoryFile) LoadFrom(ctx context.Context, r io.Reader) error {
+func (f *MemoryFile) LoadFrom(ctx context.Context, r wire.Reader) error {
// Load metadata.
- if err := state.Load(ctx, r, &f.fileSize, nil); err != nil {
+ if _, err := state.Load(ctx, r, &f.fileSize); err != nil {
return err
}
if err := f.file.Truncate(f.fileSize); err != nil {
@@ -125,7 +126,7 @@ func (f *MemoryFile) LoadFrom(ctx context.Context, r io.Reader) error {
}
newMappings := make([]uintptr, f.fileSize>>chunkShift)
f.mappings.Store(newMappings)
- if err := state.Load(ctx, r, &f.usage, nil); err != nil {
+ if _, err := state.Load(ctx, r, &f.usage); err != nil {
return err
}
diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD
index 453241eca..db7d55ef2 100644
--- a/pkg/sentry/platform/BUILD
+++ b/pkg/sentry/platform/BUILD
@@ -1,39 +1,22 @@
load("//tools:defs.bzl", "go_library")
-load("//tools/go_generics:defs.bzl", "go_template_instance")
package(licenses = ["notice"])
-go_template_instance(
- name = "file_range",
- out = "file_range.go",
- package = "platform",
- prefix = "File",
- template = "//pkg/segment:generic_range",
- types = {
- "T": "uint64",
- },
-)
-
go_library(
name = "platform",
srcs = [
"context.go",
- "file_range.go",
"mmap_min_addr.go",
"platform.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
- "//pkg/atomicbitops",
"//pkg/context",
- "//pkg/log",
- "//pkg/safecopy",
- "//pkg/safemem",
"//pkg/seccomp",
"//pkg/sentry/arch",
- "//pkg/sentry/usage",
- "//pkg/syserror",
+ "//pkg/sentry/hostmm",
+ "//pkg/sentry/memmap",
"//pkg/usermem",
],
)
diff --git a/pkg/sentry/platform/interrupt/interrupt.go b/pkg/sentry/platform/interrupt/interrupt.go
index 57be41647..9dfac3eae 100644
--- a/pkg/sentry/platform/interrupt/interrupt.go
+++ b/pkg/sentry/platform/interrupt/interrupt.go
@@ -54,8 +54,9 @@ type Forwarder struct {
// }
// defer f.Disable()
//
-// Preconditions: r must not be nil. f must not already be forwarding
-// interrupts to a Receiver.
+// Preconditions:
+// * r must not be nil.
+// * f must not already be forwarding interrupts to a Receiver.
func (f *Forwarder) Enable(r Receiver) bool {
if r == nil {
panic("nil Receiver")
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index 159f7eafd..8ce411102 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -6,15 +6,15 @@ go_library(
name = "kvm",
srcs = [
"address_space.go",
- "allocator.go",
"bluepill.go",
+ "bluepill_allocator.go",
"bluepill_amd64.go",
- "bluepill_amd64.s",
"bluepill_amd64_unsafe.go",
"bluepill_arm64.go",
"bluepill_arm64.s",
"bluepill_arm64_unsafe.go",
"bluepill_fault.go",
+ "bluepill_impl_amd64.s",
"bluepill_unsafe.go",
"context.go",
"filters_amd64.go",
@@ -41,12 +41,14 @@ go_library(
deps = [
"//pkg/abi/linux",
"//pkg/atomicbitops",
+ "//pkg/context",
"//pkg/cpuid",
"//pkg/log",
"//pkg/procid",
"//pkg/safecopy",
"//pkg/seccomp",
"//pkg/sentry/arch",
+ "//pkg/sentry/memmap",
"//pkg/sentry/platform",
"//pkg/sentry/platform/interrupt",
"//pkg/sentry/platform/ring0",
@@ -54,12 +56,15 @@ go_library(
"//pkg/sentry/time",
"//pkg/sync",
"//pkg/usermem",
+ "@org_golang_x_sys//unix:go_default_library",
],
)
go_test(
name = "kvm_test",
srcs = [
+ "kvm_amd64_test.go",
+ "kvm_arm64_test.go",
"kvm_test.go",
"virtual_map_test.go",
],
@@ -75,6 +80,15 @@ go_test(
"//pkg/sentry/platform/kvm/testutil",
"//pkg/sentry/platform/ring0",
"//pkg/sentry/platform/ring0/pagetables",
+ "//pkg/sentry/time",
"//pkg/usermem",
],
)
+
+genrule(
+ name = "bluepill_impl_amd64",
+ srcs = ["bluepill_amd64.s"],
+ outs = ["bluepill_impl_amd64.s"],
+ cmd = "(echo -e '// +build amd64\\n' && $(location //pkg/sentry/platform/ring0/gen_offsets) && cat $(SRCS)) > $@",
+ tools = ["//pkg/sentry/platform/ring0/gen_offsets"],
+)
diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go
index be213bfe8..af5c5e191 100644
--- a/pkg/sentry/platform/kvm/address_space.go
+++ b/pkg/sentry/platform/kvm/address_space.go
@@ -18,6 +18,7 @@ import (
"sync/atomic"
"gvisor.dev/gvisor/pkg/atomicbitops"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
"gvisor.dev/gvisor/pkg/sync"
@@ -26,16 +27,15 @@ import (
// dirtySet tracks vCPUs for invalidation.
type dirtySet struct {
- vCPUs []uint64
+ vCPUMasks []uint64
}
// forEach iterates over all CPUs in the dirty set.
+//
+//go:nosplit
func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) {
- m.mu.RLock()
- defer m.mu.RUnlock()
-
- for index := range ds.vCPUs {
- mask := atomic.SwapUint64(&ds.vCPUs[index], 0)
+ for index := range ds.vCPUMasks {
+ mask := atomic.SwapUint64(&ds.vCPUMasks[index], 0)
if mask != 0 {
for bit := 0; bit < 64; bit++ {
if mask&(1<<uint64(bit)) == 0 {
@@ -54,7 +54,7 @@ func (ds *dirtySet) mark(c *vCPU) bool {
index := uint64(c.id) / 64
bit := uint64(1) << uint(c.id%64)
- oldValue := atomic.LoadUint64(&ds.vCPUs[index])
+ oldValue := atomic.LoadUint64(&ds.vCPUMasks[index])
if oldValue&bit != 0 {
return false // Not clean.
}
@@ -62,7 +62,7 @@ func (ds *dirtySet) mark(c *vCPU) bool {
// Set the bit unilaterally, and ensure that a flush takes place. Note
// that it's possible for races to occur here, but since the flush is
// taking place long after these lines there's no race in practice.
- atomicbitops.OrUint64(&ds.vCPUs[index], bit)
+ atomicbitops.OrUint64(&ds.vCPUMasks[index], bit)
return true // Previously clean.
}
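dirtySet's mark/forEach pair is a lock-free bitmask: bits are set with an atomic OR and drained in bulk with an atomic swap, which is why the RLock could be dropped from forEach. A standalone sketch of the same pattern (using a CAS loop in place of gvisor's atomicbitops.OrUint64):

package main

import (
	"fmt"
	"sync/atomic"
)

// bitset tracks up to 64*len(words) ids; ids are marked with an atomic
// OR (via CAS) and drained with an atomic swap, so no mutex is needed.
type bitset struct{ words []uint64 }

// mark sets id's bit and reports whether it was previously clear.
func (b *bitset) mark(id int) bool {
	word, bit := id/64, uint64(1)<<uint(id%64)
	for {
		old := atomic.LoadUint64(&b.words[word])
		if old&bit != 0 {
			return false // already marked
		}
		if atomic.CompareAndSwapUint64(&b.words[word], old, old|bit) {
			return true
		}
	}
}

// drain atomically claims each word and invokes fn for every set bit.
func (b *bitset) drain(fn func(id int)) {
	for w := range b.words {
		mask := atomic.SwapUint64(&b.words[w], 0)
		for bit := 0; bit < 64; bit++ {
			if mask&(1<<uint(bit)) != 0 {
				fn(w*64 + bit)
			}
		}
	}
}

func main() {
	b := &bitset{words: make([]uint64, 2)}
	b.mark(3)
	b.mark(70)
	b.drain(func(id int) { fmt.Println("dirty:", id) }) // 3, then 70
}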
@@ -113,7 +113,12 @@ type hostMapEntry struct {
length uintptr
}
-func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) {
+// mapLocked maps the given host entry.
+//
+// +checkescape:hard,stack
+//
+//go:nosplit
+func (as *addressSpace) mapLocked(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) {
for m.length > 0 {
physical, length, ok := translateToPhysical(m.addr)
if !ok {
@@ -133,18 +138,10 @@ func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.Ac
// important; if the pagetable mappings were installed before
// ensuring the physical pages were available, then some other
// thread could theoretically access them.
- //
- // Due to the way KVM's shadow paging implementation works,
- // modifications to the page tables while in host mode may not
- // be trapped, leading to the shadow pages being out of sync.
- // Therefore, we need to ensure that we are in guest mode for
- // page table modifications. See the call to bluepill, below.
- as.machine.retryInGuest(func() {
- inv = as.pageTables.Map(addr, length, pagetables.MapOpts{
- AccessType: at,
- User: true,
- }, physical) || inv
- })
+ inv = as.pageTables.Map(addr, length, pagetables.MapOpts{
+ AccessType: at,
+ User: true,
+ }, physical) || inv
m.addr += length
m.length -= length
addr += usermem.Addr(length)
@@ -154,7 +151,7 @@ func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.Ac
}
// MapFile implements platform.AddressSpace.MapFile.
-func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.FileRange, at usermem.AccessType, precommit bool) error {
+func (as *addressSpace) MapFile(addr usermem.Addr, f memmap.File, fr memmap.FileRange, at usermem.AccessType, precommit bool) error {
as.mu.Lock()
defer as.mu.Unlock()
@@ -176,6 +173,10 @@ func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.
return err
}
+ // See block in mapLocked.
+ as.pageTables.Allocator.(*allocator).cpu = as.machine.Get()
+ defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu)
+
// Map the mappings in the sentry's address space (guest physical memory)
// into the application's address space (guest virtual memory).
inv := false
@@ -190,7 +191,12 @@ func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.
_ = s[i] // Touch to commit.
}
}
- prev := as.mapHost(addr, hostMapEntry{
+
+ // See bluepill_allocator.go.
+ bluepill(as.pageTables.Allocator.(*allocator).cpu)
+
+ // Perform the mapping.
+ prev := as.mapLocked(addr, hostMapEntry{
addr: b.Addr(),
length: uintptr(b.Len()),
}, at)
@@ -204,17 +210,27 @@ func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.
return nil
}
+// unmapLocked is an escape-checked wrapper around Unmap.
+//
+// +checkescape:hard,stack
+//
+//go:nosplit
+func (as *addressSpace) unmapLocked(addr usermem.Addr, length uint64) bool {
+ return as.pageTables.Unmap(addr, uintptr(length))
+}
+
// Unmap unmaps the given range by calling pagetables.PageTables.Unmap.
func (as *addressSpace) Unmap(addr usermem.Addr, length uint64) {
as.mu.Lock()
defer as.mu.Unlock()
- // See above re: retryInGuest.
- var prev bool
- as.machine.retryInGuest(func() {
- prev = as.pageTables.Unmap(addr, uintptr(length)) || prev
- })
- if prev {
+ // See above & bluepill_allocator.go.
+ as.pageTables.Allocator.(*allocator).cpu = as.machine.Get()
+ defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu)
+ bluepill(as.pageTables.Allocator.(*allocator).cpu)
+
+ if prev := as.unmapLocked(addr, length); prev {
+ // Invalidate all active vCPUs.
as.invalidate()
// Recycle any freed intermediate pages.
@@ -227,8 +243,14 @@ func (as *addressSpace) Release() {
as.Unmap(0, ^uint64(0))
// Free all pages from the allocator.
- as.pageTables.Allocator.(allocator).base.Drain()
+ as.pageTables.Allocator.(*allocator).base.Drain()
// Drop all cached machine references.
as.machine.dropPageTables(as.pageTables)
}
+
+// PreFork implements platform.AddressSpace.PreFork.
+func (as *addressSpace) PreFork() {}
+
+// PostFork implements platform.AddressSpace.PostFork.
+func (as *addressSpace) PostFork() {}
diff --git a/pkg/sentry/platform/kvm/allocator.go b/pkg/sentry/platform/kvm/bluepill_allocator.go
index 3f35414bb..9485e1301 100644
--- a/pkg/sentry/platform/kvm/allocator.go
+++ b/pkg/sentry/platform/kvm/bluepill_allocator.go
@@ -21,56 +21,80 @@ import (
)
type allocator struct {
- base *pagetables.RuntimeAllocator
+ base pagetables.RuntimeAllocator
+
+ // cpu must be set prior to any pagetable operation.
+ //
+ // Due to the way KVM's shadow paging implementation works,
+ // modifications to the page tables while in host mode may not be
+ // trapped, leading to the shadow pages being out of sync. Therefore,
+ // we need to ensure that we are in guest mode for page table
+ // modifications. See the call to bluepill, below.
+ cpu *vCPU
}
// newAllocator is used to define the allocator.
-func newAllocator() allocator {
- return allocator{
- base: pagetables.NewRuntimeAllocator(),
- }
+func newAllocator() *allocator {
+ a := new(allocator)
+ a.base.Init()
+ return a
}
// NewPTEs implements pagetables.Allocator.NewPTEs.
//
+// +checkescape:all
+//
//go:nosplit
-func (a allocator) NewPTEs() *pagetables.PTEs {
- return a.base.NewPTEs()
+func (a *allocator) NewPTEs() *pagetables.PTEs {
+ ptes := a.base.NewPTEs() // escapes: bluepill below.
+ if a.cpu != nil {
+ bluepill(a.cpu)
+ }
+ return ptes
}
// PhysicalFor returns the physical address for a set of PTEs.
//
+// +checkescape:all
+//
//go:nosplit
-func (a allocator) PhysicalFor(ptes *pagetables.PTEs) uintptr {
+func (a *allocator) PhysicalFor(ptes *pagetables.PTEs) uintptr {
virtual := a.base.PhysicalFor(ptes)
physical, _, ok := translateToPhysical(virtual)
if !ok {
- panic(fmt.Sprintf("PhysicalFor failed for %p", ptes))
+ panic(fmt.Sprintf("PhysicalFor failed for %p", ptes)) // escapes: panic.
}
return physical
}
// LookupPTEs implements pagetables.Allocator.LookupPTEs.
//
+// +checkescape:all
+//
//go:nosplit
-func (a allocator) LookupPTEs(physical uintptr) *pagetables.PTEs {
+func (a *allocator) LookupPTEs(physical uintptr) *pagetables.PTEs {
virtualStart, physicalStart, _, ok := calculateBluepillFault(physical, physicalRegions)
if !ok {
- panic(fmt.Sprintf("LookupPTEs failed for 0x%x", physical))
+ panic(fmt.Sprintf("LookupPTEs failed for 0x%x", physical)) // escapes: panic.
}
return a.base.LookupPTEs(virtualStart + (physical - physicalStart))
}
// FreePTEs implements pagetables.Allocator.FreePTEs.
//
+// +checkescape:all
+//
//go:nosplit
-func (a allocator) FreePTEs(ptes *pagetables.PTEs) {
- a.base.FreePTEs(ptes)
+func (a *allocator) FreePTEs(ptes *pagetables.PTEs) {
+ a.base.FreePTEs(ptes) // escapes: bluepill below.
+ if a.cpu != nil {
+ bluepill(a.cpu)
+ }
}
// Recycle implements pagetables.Allocator.Recycle.
//
//go:nosplit
-func (a allocator) Recycle() {
+func (a *allocator) Recycle() {
a.base.Recycle()
}
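
The allocator now carries the guest-mode invariant itself: any base-allocator call that might have bounced the thread back to host mode (growing the pool, for instance) is followed by bluepill(a.cpu), so the page-table writes that follow always happen in guest mode. A rough sketch of this wrapper pattern; simpleAllocator and page are hypothetical stand-ins for the real RuntimeAllocator and PTEs:

    package main

    import "fmt"

    type page [512]uint64

    type simpleAllocator struct{ pool []*page }

    func (s *simpleAllocator) alloc() *page {
        if n := len(s.pool); n > 0 {
            p := s.pool[n-1]
            s.pool = s.pool[:n-1]
            return p
        }
        return new(page) // In the real code this may bounce us to host mode.
    }

    // hookedAllocator restores an invariant after every allocation, the
    // way NewPTEs/FreePTEs above call bluepill(a.cpu).
    type hookedAllocator struct {
        base    simpleAllocator
        reenter func() // Stands in for bluepill(cpu).
    }

    func (a *hookedAllocator) alloc() *page {
        p := a.base.alloc()
        if a.reenter != nil {
            a.reenter() // Re-enter guest mode before page tables are written.
        }
        return p
    }

    func main() {
        inGuest := false
        a := &hookedAllocator{reenter: func() { inGuest = true }}
        _ = a.alloc()
        fmt.Println("in guest mode:", inGuest)
    }
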
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go
index 133c2203d..ddc1554d5 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64.go
+++ b/pkg/sentry/platform/kvm/bluepill_amd64.go
@@ -63,6 +63,8 @@ func bluepillArchEnter(context *arch.SignalContext64) *vCPU {
// KernelSyscall handles kernel syscalls.
//
+// +checkescape:all
+//
//go:nosplit
func (c *vCPU) KernelSyscall() {
regs := c.Registers()
@@ -72,13 +74,15 @@ func (c *vCPU) KernelSyscall() {
// We only trigger a bluepill entry in the bluepill function, and can
// therefore be guaranteed that there is no floating point state to be
// loaded on resuming from halt. We only worry about saving on exit.
- ring0.SaveFloatingPoint((*byte)(c.floatingPointState))
+ ring0.SaveFloatingPoint((*byte)(c.floatingPointState)) // escapes: no.
ring0.Halt()
- ring0.WriteFS(uintptr(regs.Fs_base)) // Reload host segment.
+ ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no, reload host segment.
}
// KernelException handles kernel exceptions.
//
+// +checkescape:all
+//
//go:nosplit
func (c *vCPU) KernelException(vector ring0.Vector) {
regs := c.Registers()
@@ -89,9 +93,9 @@ func (c *vCPU) KernelException(vector ring0.Vector) {
regs.Rip = 0
}
// See above.
- ring0.SaveFloatingPoint((*byte)(c.floatingPointState))
+ ring0.SaveFloatingPoint((*byte)(c.floatingPointState)) // escapes: no.
ring0.Halt()
- ring0.WriteFS(uintptr(regs.Fs_base)) // Reload host segment.
+ ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no; reload host segment.
}
// bluepillArchExit is called during bluepillEnter.
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.s b/pkg/sentry/platform/kvm/bluepill_amd64.s
index 2bc34a435..025ea93b5 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64.s
+++ b/pkg/sentry/platform/kvm/bluepill_amd64.s
@@ -19,11 +19,6 @@
// This is guaranteed to be zero.
#define VCPU_CPU 0x0
-// CPU_SELF is the self reference in ring0's percpu.
-//
-// This is guaranteed to be zero.
-#define CPU_SELF 0x0
-
// Context offsets.
//
// Only limited use of the context is done in the assembly stub below, most is
@@ -44,7 +39,7 @@ begin:
LEAQ VCPU_CPU(AX), BX
BYTE CLI;
check_vcpu:
- MOVQ CPU_SELF(GS), CX
+ MOVQ ENTRY_CPU_SELF(GS), CX
CMPQ BX, CX
JE right_vCPU
wrong_vcpu:
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
index 99cac665d..0a54dd30d 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
@@ -17,6 +17,7 @@
package kvm
import (
+ "syscall"
"unsafe"
"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -53,3 +54,63 @@ func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) {
context.Rbx = uint64(uintptr(unsafe.Pointer(c)))
context.Rip = uint64(dieTrampolineAddr)
}
+
+// getHypercallID returns the hypercall ID. amd64 has no hypercall MMIO
+// window, so this always returns _KVM_HYPERCALL_MAX.
+//
+//go:nosplit
+func getHypercallID(addr uintptr) int {
+ return _KVM_HYPERCALL_MAX
+}
+
+// bluepillStopGuest is responsible for injecting an interrupt.
+//
+//go:nosplit
+func bluepillStopGuest(c *vCPU) {
+ // Interrupt: we must have requested an interrupt
+ // window; set the interrupt line.
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_INTERRUPT,
+ uintptr(unsafe.Pointer(&bounce))); errno != 0 {
+ throw("interrupt injection failed")
+ }
+ // Clear previous injection request.
+ c.runData.requestInterruptWindow = 0
+}
+
+// bluepillReadyStopGuest checks whether the current vCPU is ready for interrupt injection.
+//
+//go:nosplit
+func bluepillReadyStopGuest(c *vCPU) bool {
+ if c.runData.readyForInterruptInjection == 0 {
+ return false
+ }
+
+ if c.runData.ifFlag == 0 {
+ // This is impossible if readyForInterruptInjection is 1.
+ throw("interrupts are disabled")
+ }
+
+ // Disable interrupts if we are in kernel space.
+ //
+ // When the Sentry switches into kernel mode, it disables
+ // interrupts. But when the Go runtime switches onto a goroutine that
+ // was saved in host mode, it restores flags, which re-enables
+ // interrupts. See the comment on UserFlagsSet for more details.
+ uregs := userRegs{}
+ err := c.getUserRegisters(&uregs)
+ if err != 0 {
+ throw("failed to get user registers")
+ }
+
+ if ring0.IsKernelFlags(uregs.RFLAGS) {
+ uregs.RFLAGS &^= ring0.KernelFlagsClear
+ err = c.setUserRegisters(&uregs)
+ if err != 0 {
+ throw("failed to set user registers")
+ }
+ return false
+ }
+ return true
+}
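
bluepillReadyStopGuest repairs a subtle interaction: the Go runtime can resume a goroutine with host-saved flags, re-enabling interrupts while sentry kernel code runs. The sketch below shows only the shape of that repair; flagIF is the genuine x86 RFLAGS.IF bit, but kernelMark is an invented marker standing in for whatever ring0.IsKernelFlags actually tests:

    package main

    import "fmt"

    const (
        flagIF     = uint64(1) << 9 // x86 RFLAGS.IF: interrupts enabled.
        kernelMark = uint64(1) << 1 // Hypothetical "sentry kernel" marker bit.
    )

    // fixFlags mirrors the shape of bluepillReadyStopGuest: if the saved
    // flags indicate kernel mode, force interrupts off and report that
    // the vCPU is not yet ready for injection.
    func fixFlags(rflags uint64) (fixed uint64, ready bool) {
        if rflags&kernelMark != 0 {
            return rflags &^ flagIF, false
        }
        return rflags, true
    }

    func main() {
        fixed, ready := fixFlags(kernelMark | flagIF)
        fmt.Printf("flags=%#x ready=%v\n", fixed, ready)
    }
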
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go
index c215d443c..58f3d6fdd 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64.go
+++ b/pkg/sentry/platform/kvm/bluepill_arm64.go
@@ -26,8 +26,29 @@ import (
var (
// The action for bluepillSignal is changed by sigaction().
bluepillSignal = syscall.SIGILL
+
+ // vcpuSErr is the event of system error.
+ vcpuSErr = kvmVcpuEvents{
+ exception: exception{
+ sErrPending: 1,
+ sErrHasEsr: 0,
+ pad: [6]uint8{0, 0, 0, 0, 0, 0},
+ sErrEsr: 1,
+ },
+ rsvd: [12]uint32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ }
)
+// getTLS returns the value of TPIDR_EL0 register.
+//
+//go:nosplit
+func getTLS() (value uint64)
+
+// setTLS writes the TPIDR_EL0 value.
+//
+//go:nosplit
+func setTLS(value uint64)
+
// bluepillArchEnter is called during bluepillEnter.
//
//go:nosplit
@@ -38,8 +59,10 @@ func bluepillArchEnter(context *arch.SignalContext64) (c *vCPU) {
regs.Sp = context.Sp
regs.Pc = context.Pc
regs.Pstate = context.Pstate
- regs.Pstate &^= uint64(ring0.KernelFlagsClear)
+ regs.Pstate &^= uint64(ring0.PsrFlagsClear)
regs.Pstate |= ring0.KernelFlagsSet
+ regs.TPIDR_EL0 = getTLS()
+
return
}
@@ -52,8 +75,9 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) {
context.Sp = regs.Sp
context.Pc = regs.Pc
context.Pstate = regs.Pstate
- context.Pstate &^= uint64(ring0.UserFlagsClear)
+ context.Pstate &^= uint64(ring0.PsrFlagsClear)
context.Pstate |= ring0.UserFlagsSet
+ setTLS(regs.TPIDR_EL0)
lazyVfp := c.GetLazyVFP()
if lazyVfp != 0 {
@@ -66,6 +90,8 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) {
// KernelSyscall handles kernel syscalls.
//
+// +checkescape:all
+//
//go:nosplit
func (c *vCPU) KernelSyscall() {
regs := c.Registers()
@@ -88,6 +114,8 @@ func (c *vCPU) KernelSyscall() {
// KernelException handles kernel exceptions.
//
+// +checkescape:all
+//
//go:nosplit
func (c *vCPU) KernelException(vector ring0.Vector) {
regs := c.Registers()
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.s b/pkg/sentry/platform/kvm/bluepill_arm64.s
index 04efa0147..09c7e88e5 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64.s
+++ b/pkg/sentry/platform/kvm/bluepill_arm64.s
@@ -32,6 +32,18 @@
#define CONTEXT_PC 0x1B8
#define CONTEXT_R0 0xB8
+// getTLS returns the value of TPIDR_EL0 register.
+TEXT ·getTLS(SB),NOSPLIT,$0-8
+ MRS TPIDR_EL0, R1
+ MOVD R1, ret+0(FP)
+ RET
+
+// setTLS writes the TPIDR_EL0 value.
+TEXT ·setTLS(SB),NOSPLIT,$0-8
+ MOVD addr+0(FP), R1
+ MSR R1, TPIDR_EL0
+ RET
+
// See bluepill.go.
TEXT ·bluepill(SB),NOSPLIT,$0
begin:
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
index 4ca2b7717..b35c930e2 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
@@ -17,6 +17,7 @@
package kvm
import (
+ "syscall"
"unsafe"
"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -40,7 +41,7 @@ func fpsimdPtr(addr *byte) *arch.FpsimdContext {
func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) {
// If the vCPU is in user mode, we set the stack to the stored stack
// value in the vCPU itself. We don't want to unwind the user stack.
- if guestRegs.Regs.Pstate&ring0.PSR_MODE_MASK == ring0.PSR_MODE_EL0t {
+ if guestRegs.Regs.Pstate&ring0.PsrModeMask == ring0.UserFlagsSet {
regs := c.CPU.Registers()
context.Regs[0] = regs.Regs[0]
context.Sp = regs.Sp
@@ -61,3 +62,36 @@ func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) {
func bluepillArchFpContext(context unsafe.Pointer) *arch.FpsimdContext {
return &((*arch.SignalContext64)(context).Fpsimd64)
}
+
+// getHypercallID returns the hypercall ID for the given MMIO address.
+//
+// On Arm64, the MMIO address should be 64-bit aligned.
+//
+//go:nosplit
+func getHypercallID(addr uintptr) int {
+ if addr < arm64HypercallMMIOBase || addr >= (arm64HypercallMMIOBase+_AARCH64_HYPERCALL_MMIO_SIZE) {
+ return _KVM_HYPERCALL_MAX
+ } else {
+ return int(((addr) - arm64HypercallMMIOBase) >> 3)
+ }
+}
+
+// bluepillStopGuest is responsible for injecting an sError.
+//
+//go:nosplit
+func bluepillStopGuest(c *vCPU) {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_SET_VCPU_EVENTS,
+ uintptr(unsafe.Pointer(&vcpuSErr))); errno != 0 {
+ throw("sErr injection failed")
+ }
+}
+
+// bluepillReadyStopGuest checks whether the current vCPU is ready for sError injection.
+//
+//go:nosplit
+func bluepillReadyStopGuest(c *vCPU) bool {
+ return true
+}
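
On arm64 a hypercall is an access to a reserved MMIO window, and getHypercallID recovers the call number from the faulting address, one 64-bit-aligned slot per hypercall. The same arithmetic in standalone form; mmioBase is an assumed constant here, whereas the real arm64HypercallMMIOBase is chosen at runtime:

    package main

    import "fmt"

    const (
        mmioBase = uintptr(0xffff0000) // Assumed; set at runtime in the real code.
        slotSize = uintptr(8)          // 64-bit-aligned slots.
        numCalls = uintptr(1)          // Only hypercall_vmexit today.
    )

    // hypercallID maps a faulting MMIO address to a hypercall number,
    // the same arithmetic as getHypercallID above.
    func hypercallID(addr uintptr) int {
        if addr < mmioBase || addr >= mmioBase+numCalls*slotSize {
            return -1 // Outside the window: not a hypercall.
        }
        return int((addr - mmioBase) >> 3)
    }

    func main() {
        fmt.Println(hypercallID(mmioBase))     // 0: hypercall_vmexit.
        fmt.Println(hypercallID(mmioBase + 8)) // -1: outside the window.
    }
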
diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go
index e34f46aeb..a182e4f22 100644
--- a/pkg/sentry/platform/kvm/bluepill_fault.go
+++ b/pkg/sentry/platform/kvm/bluepill_fault.go
@@ -98,6 +98,10 @@ func handleBluepillFault(m *machine, physical uintptr, phyRegions []physicalRegi
}
errno := m.setMemoryRegion(int(slot), physicalStart, length, virtualStart, flags)
if errno == 0 {
+ // Store the physical address in the slot. This is used to
+ // avoid calls to handleBluepillFault in the future (see
+ // machine.mapPhysical).
+ atomic.StoreUintptr(&m.usedSlots[slot], physical)
// Successfully added region; we can increment nextSlot and
// allow another set to proceed here.
atomic.StoreUint32(&m.nextSlot, slot+1)
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index 2407014e9..eb05950cd 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -13,7 +13,7 @@
// limitations under the License.
// +build go1.12
-// +build !go1.16
+// +build !go1.17
// Check go:linkname function signatures when updating Go version.
@@ -58,20 +58,40 @@ func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
return &((*arch.UContext64)(context).MContext)
}
+// bluepillGuestExit handles a guest exit: it copies out the registers and
+// returns the vCPU to the ready state.
+//
+//go:nosplit
+func bluepillGuestExit(c *vCPU, context unsafe.Pointer) {
+ // Increment our counter.
+ atomic.AddUint64(&c.guestExits, 1)
+
+ // Copy out registers.
+ bluepillArchExit(c, bluepillArchContext(context))
+
+ // Return to the vCPUReady state; notify any waiters.
+ user := atomic.LoadUint32(&c.state) & vCPUUser
+ switch atomic.SwapUint32(&c.state, user) {
+ case user | vCPUGuest: // Expected case.
+ case user | vCPUGuest | vCPUWaiter:
+ c.notify()
+ default:
+ throw("invalid state")
+ }
+}
+
// bluepillHandler is called from the signal stub.
//
// The world may be stopped while this is executing, and it executes on the
// signal stack. It should only execute raw system calls and functions that are
// explicitly marked go:nosplit.
//
+// +checkescape:all
+//
//go:nosplit
func bluepillHandler(context unsafe.Pointer) {
// Sanitize the registers; interrupts must always be disabled.
c := bluepillArchEnter(bluepillArchContext(context))
- // Increment the number of switches.
- atomic.AddUint32(&c.switches, 1)
-
// Mark this as guest mode.
switch atomic.SwapUint32(&c.state, vCPUGuest|vCPUUser) {
case vCPUUser: // Expected case.
@@ -82,7 +102,8 @@ func bluepillHandler(context unsafe.Pointer) {
}
for {
- switch _, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0); errno {
+ _, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0) // escapes: no.
+ switch errno {
case 0: // Expected case.
case syscall.EINTR:
// First, we process whatever pending signal
@@ -90,7 +111,7 @@ func bluepillHandler(context unsafe.Pointer) {
// currently, all signals are masked and the signal
// must have been delivered directly to this thread.
timeout := syscall.Timespec{}
- sig, _, errno := syscall.RawSyscall6(
+ sig, _, errno := syscall.RawSyscall6( // escapes: no.
syscall.SYS_RT_SIGTIMEDWAIT,
uintptr(unsafe.Pointer(&bounceSignalMask)),
0, // siginfo.
@@ -112,12 +133,12 @@ func bluepillHandler(context unsafe.Pointer) {
// PIC, we can't inject an interrupt while they are
// masked. We need to request a window if it's not
// ready.
- if c.runData.readyForInterruptInjection == 0 {
- c.runData.requestInterruptWindow = 1
- continue // Rerun vCPU.
- } else {
+ if bluepillReadyStopGuest(c) {
// Force injection below; the vCPU is ready.
c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN
+ } else {
+ c.runData.requestInterruptWindow = 1
+ continue // Rerun vCPU.
}
case syscall.EFAULT:
// If a fault is not serviceable due to the host
@@ -125,7 +146,7 @@ func bluepillHandler(context unsafe.Pointer) {
// MMIO exit we receive EFAULT from the run ioctl. We
// always inject an NMI here since we may be in kernel
// mode and have interrupts disabled.
- if _, _, errno := syscall.RawSyscall(
+ if _, _, errno := syscall.RawSyscall( // escapes: no.
syscall.SYS_IOCTL,
uintptr(c.fd),
_KVM_NMI, 0); errno != 0 {
@@ -156,25 +177,20 @@ func bluepillHandler(context unsafe.Pointer) {
c.die(bluepillArchContext(context), "debug")
return
case _KVM_EXIT_HLT:
- // Copy out registers.
- bluepillArchExit(c, bluepillArchContext(context))
-
- // Return to the vCPUReady state; notify any waiters.
- user := atomic.LoadUint32(&c.state) & vCPUUser
- switch atomic.SwapUint32(&c.state, user) {
- case user | vCPUGuest: // Expected case.
- case user | vCPUGuest | vCPUWaiter:
- c.notify()
- default:
- throw("invalid state")
- }
+ bluepillGuestExit(c, context)
return
case _KVM_EXIT_MMIO:
+ physical := uintptr(c.runData.data[0])
+ if getHypercallID(physical) == _KVM_HYPERCALL_VMEXIT {
+ bluepillGuestExit(c, context)
+ return
+ }
+
// Increment the fault count.
atomic.AddUint32(&c.faults, 1)
// For MMIO, the physical address is the first data item.
- physical := uintptr(c.runData.data[0])
+ physical = uintptr(c.runData.data[0])
virtual, ok := handleBluepillFault(c.machine, physical, physicalRegions, _KVM_MEM_FLAGS_NONE)
if !ok {
c.die(bluepillArchContext(context), "invalid physical address")
@@ -201,17 +217,7 @@ func bluepillHandler(context unsafe.Pointer) {
}
}
case _KVM_EXIT_IRQ_WINDOW_OPEN:
- // Interrupt: we must have requested an interrupt
- // window; set the interrupt line.
- if _, _, errno := syscall.RawSyscall(
- syscall.SYS_IOCTL,
- uintptr(c.fd),
- _KVM_INTERRUPT,
- uintptr(unsafe.Pointer(&bounce))); errno != 0 {
- throw("interrupt injection failed")
- }
- // Clear previous injection request.
- c.runData.requestInterruptWindow = 0
+ bluepillStopGuest(c)
case _KVM_EXIT_SHUTDOWN:
c.die(bluepillArchContext(context), "shutdown")
return
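
With these changes the exit dispatch is symmetric across architectures: _KVM_EXIT_HLT and the MMIO vmexit hypercall both funnel into bluepillGuestExit, while _KVM_EXIT_IRQ_WINDOW_OPEN defers to the per-arch bluepillStopGuest. A simplified, runnable sketch of that control flow, with plain functions standing in for the nosplit handler and its raw ioctls:

    package main

    import "fmt"

    // Exit reasons, simplified stand-ins for the _KVM_EXIT_* constants.
    const (
        exitHLT = iota
        exitMMIO
        exitIRQWindowOpen
    )

    // runLoop mirrors the shape of bluepillHandler: run the vCPU, then
    // dispatch on the exit reason until the guest leaves via HLT (or,
    // on arm64, the vmexit hypercall).
    func runLoop(run func() int, onMMIO, stopGuest func()) {
        for {
            switch reason := run(); reason {
            case exitHLT:
                return // bluepillGuestExit in the real handler.
            case exitMMIO:
                onMMIO() // handleBluepillFault + instruction emulation.
            case exitIRQWindowOpen:
                stopGuest() // Inject the pending interrupt / sError.
            default:
                panic(fmt.Sprintf("unhandled exit: %d", reason))
            }
        }
    }

    func main() {
        exits := []int{exitMMIO, exitIRQWindowOpen, exitHLT}
        i := 0
        runLoop(
            func() int { r := exits[i]; i++; return r },
            func() { fmt.Println("handle MMIO fault") },
            func() { fmt.Println("inject interrupt") },
        )
    }
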
diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go
index 6507121ea..17268d127 100644
--- a/pkg/sentry/platform/kvm/context.go
+++ b/pkg/sentry/platform/kvm/context.go
@@ -15,6 +15,9 @@
package kvm
import (
+ "sync/atomic"
+
+ pkgcontext "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/platform/interrupt"
@@ -37,7 +40,8 @@ type context struct {
}
// Switch runs the provided context in the given address space.
-func (c *context) Switch(as platform.AddressSpace, ac arch.Context, _ int32) (*arch.SignalInfo, usermem.AccessType, error) {
+func (c *context) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac arch.Context, _ int32) (*arch.SignalInfo, usermem.AccessType, error) {
+ as := mm.AddressSpace()
localAS := as.(*addressSpace)
// Grab a vCPU.
@@ -73,6 +77,9 @@ func (c *context) Switch(as platform.AddressSpace, ac arch.Context, _ int32) (*a
// Clear the address space.
cpu.active.set(nil)
+ // Increment the number of user exits.
+ atomic.AddUint64(&cpu.userExits, 1)
+
// Release resources.
c.machine.Put(cpu)
@@ -88,3 +95,9 @@ func (c *context) Interrupt() {
// Release implements platform.Context.Release().
func (c *context) Release() {}
+
+// FullStateChanged implements platform.Context.FullStateChanged.
+func (c *context) FullStateChanged() {}
+
+// PullFullState implements platform.Context.PullFullState.
+func (c *context) PullFullState(as platform.AddressSpace, ac arch.Context) {}
diff --git a/pkg/sentry/platform/kvm/filters_amd64.go b/pkg/sentry/platform/kvm/filters_amd64.go
index 7d949f1dd..d3d216aa5 100644
--- a/pkg/sentry/platform/kvm/filters_amd64.go
+++ b/pkg/sentry/platform/kvm/filters_amd64.go
@@ -17,14 +17,23 @@ package kvm
import (
"syscall"
+ "golang.org/x/sys/unix"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/seccomp"
)
// SyscallFilters returns syscalls made exclusively by the KVM platform.
func (*KVM) SyscallFilters() seccomp.SyscallRules {
return seccomp.SyscallRules{
- syscall.SYS_ARCH_PRCTL: {},
- syscall.SYS_IOCTL: {},
+ syscall.SYS_ARCH_PRCTL: {},
+ syscall.SYS_IOCTL: {},
+ unix.SYS_MEMBARRIER: []seccomp.Rule{
+ {
+ seccomp.EqualTo(linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED),
+ seccomp.EqualTo(0),
+ },
+ },
syscall.SYS_MMAP: {},
syscall.SYS_RT_SIGSUSPEND: {},
syscall.SYS_RT_SIGTIMEDWAIT: {},
diff --git a/pkg/sentry/platform/kvm/filters_arm64.go b/pkg/sentry/platform/kvm/filters_arm64.go
index 9245d07c2..21abc2a3d 100644
--- a/pkg/sentry/platform/kvm/filters_arm64.go
+++ b/pkg/sentry/platform/kvm/filters_arm64.go
@@ -17,13 +17,22 @@ package kvm
import (
"syscall"
+ "golang.org/x/sys/unix"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/seccomp"
)
// SyscallFilters returns syscalls made exclusively by the KVM platform.
func (*KVM) SyscallFilters() seccomp.SyscallRules {
return seccomp.SyscallRules{
- syscall.SYS_IOCTL: {},
+ syscall.SYS_IOCTL: {},
+ unix.SYS_MEMBARRIER: []seccomp.Rule{
+ {
+ seccomp.EqualTo(linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED),
+ seccomp.EqualTo(0),
+ },
+ },
syscall.SYS_MMAP: {},
syscall.SYS_RT_SIGSUSPEND: {},
syscall.SYS_RT_SIGTIMEDWAIT: {},
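
Both filter files now admit exactly one membarrier(2) variant: MEMBARRIER_CMD_PRIVATE_EXPEDITED with a zero flags word; any other command remains blocked. This matches the barrier that platform.UseHostProcessMemoryBarrier (added to kvm.go below) relies on. A hedged sketch of the call itself; note the one-time registration, which presumably happens before the seccomp filters are installed:

    //go:build linux

    package main

    import (
        "fmt"

        "golang.org/x/sys/unix"
    )

    const (
        cmdRegisterPrivateExpedited = 1 << 4 // MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED
        cmdPrivateExpedited         = 1 << 3 // MEMBARRIER_CMD_PRIVATE_EXPEDITED
    )

    func main() {
        // Registration is required once per process before the
        // expedited command may be used.
        if _, _, errno := unix.Syscall(unix.SYS_MEMBARRIER, cmdRegisterPrivateExpedited, 0, 0); errno != 0 {
            fmt.Println("register failed:", errno)
            return
        }
        // Every thread of this process passes through a full memory
        // barrier before this call returns.
        if _, _, errno := unix.Syscall(unix.SYS_MEMBARRIER, cmdPrivateExpedited, 0, 0); errno != 0 {
            fmt.Println("membarrier failed:", errno)
            return
        }
        fmt.Println("barrier complete")
    }
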
diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go
index ae813e24e..dd45ad10b 100644
--- a/pkg/sentry/platform/kvm/kvm.go
+++ b/pkg/sentry/platform/kvm/kvm.go
@@ -63,6 +63,9 @@ type runData struct {
type KVM struct {
platform.NoCPUPreemptionDetection
+ // KVM never changes mm_structs.
+ platform.UseHostProcessMemoryBarrier
+
// machine is the backing VM.
machine *machine
}
@@ -156,15 +159,7 @@ func (*KVM) MaxUserAddress() usermem.Addr {
func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan struct{}, error) {
// Allocate page tables and install system mappings.
pageTables := pagetables.New(newAllocator())
- applyPhysicalRegions(func(pr physicalRegion) bool {
- // Map the kernel in the upper half.
- pageTables.Map(
- usermem.Addr(ring0.KernelStartAddress|pr.virtual),
- pr.length,
- pagetables.MapOpts{AccessType: usermem.AnyAccess},
- pr.physical)
- return true // Keep iterating.
- })
+ k.machine.mapUpperHalf(pageTables)
// Return the new address space.
return &addressSpace{
diff --git a/pkg/sentry/platform/kvm/kvm_amd64_test.go b/pkg/sentry/platform/kvm/kvm_amd64_test.go
new file mode 100644
index 000000000..c0b4fd374
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_amd64_test.go
@@ -0,0 +1,51 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package kvm
+
+import (
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil"
+ "gvisor.dev/gvisor/pkg/sentry/platform/ring0"
+ "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
+)
+
+func TestSegments(t *testing.T) {
+ applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
+ testutil.SetTestSegments(regs)
+ for {
+ var si arch.SignalInfo
+ if _, err := c.SwitchToUser(ring0.SwitchOpts{
+ Registers: regs,
+ FloatingPointState: dummyFPState,
+ PageTables: pt,
+ FullRestore: true,
+ }, &si); err == platform.ErrContextInterrupt {
+ continue // Retry.
+ } else if err != nil {
+ t.Errorf("application segment check with full restore got unexpected error: %v", err)
+ }
+ if err := testutil.CheckTestSegments(regs); err != nil {
+ t.Errorf("application segment check with full restore failed: %v", err)
+ }
+ break // Done.
+ }
+ return false
+ })
+}
diff --git a/pkg/sentry/platform/kvm/kvm_arm64.go b/pkg/sentry/platform/kvm/kvm_arm64.go
index 29d457a7e..0b06a923a 100644
--- a/pkg/sentry/platform/kvm/kvm_arm64.go
+++ b/pkg/sentry/platform/kvm/kvm_arm64.go
@@ -26,6 +26,9 @@ type kvmOneReg struct {
addr uint64
}
+// arm64HypercallMMIOBase is the MMIO base address used to dispatch hypercalls.
+var arm64HypercallMMIOBase uintptr
+
const KVM_NR_SPSR = 5
type userFpsimdState struct {
@@ -43,6 +46,18 @@ type userRegs struct {
fpRegs userFpsimdState
}
+type exception struct {
+ sErrPending uint8
+ sErrHasEsr uint8
+ pad [6]uint8
+ sErrEsr uint64
+}
+
+type kvmVcpuEvents struct {
+ exception
+ rsvd [12]uint32
+}
+
// updateGlobalOnce does global initialization. It has to be called only once.
func updateGlobalOnce(fd int) error {
physicalInit()
diff --git a/pkg/sentry/platform/kvm/kvm_arm64_test.go b/pkg/sentry/platform/kvm/kvm_arm64_test.go
new file mode 100644
index 000000000..0e3d84d95
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_arm64_test.go
@@ -0,0 +1,31 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package kvm
+
+import (
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil"
+)
+
+func TestKernelTLS(t *testing.T) {
+ bluepillTest(t, func(c *vCPU) {
+ if !testutil.TLSWorks() {
+ t.Errorf("tls does not work, and it should!")
+ }
+ })
+}
diff --git a/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go b/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go
index 6531bae1d..48ccf8474 100644
--- a/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go
@@ -22,7 +22,8 @@ import (
)
var (
- runDataSize int
+ runDataSize int
+ hasGuestPCID bool
)
func updateSystemValues(fd int) error {
@@ -33,6 +34,7 @@ func updateSystemValues(fd int) error {
}
// Save the data.
runDataSize = int(sz)
+ hasGuestPCID = true
// Success.
return nil
diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go
index 1d5c77ff4..6abaa21c4 100644
--- a/pkg/sentry/platform/kvm/kvm_const.go
+++ b/pkg/sentry/platform/kvm/kvm_const.go
@@ -26,15 +26,21 @@ const (
_KVM_RUN = 0xae80
_KVM_NMI = 0xae9a
_KVM_CHECK_EXTENSION = 0xae03
+ _KVM_GET_TSC_KHZ = 0xaea3
+ _KVM_SET_TSC_KHZ = 0xaea2
_KVM_INTERRUPT = 0x4004ae86
_KVM_SET_MSRS = 0x4008ae89
_KVM_SET_USER_MEMORY_REGION = 0x4020ae46
_KVM_SET_REGS = 0x4090ae82
_KVM_SET_SREGS = 0x4138ae84
+ _KVM_GET_MSRS = 0xc008ae88
_KVM_GET_REGS = 0x8090ae81
+ _KVM_GET_SREGS = 0x8138ae83
_KVM_GET_SUPPORTED_CPUID = 0xc008ae05
_KVM_SET_CPUID2 = 0x4008ae90
_KVM_SET_SIGNAL_MASK = 0x4004ae8b
+ _KVM_GET_VCPU_EVENTS = 0x8040ae9f
+ _KVM_SET_VCPU_EVENTS = 0x4040aea0
)
// KVM exit reasons.
@@ -54,12 +60,16 @@ const (
// KVM capability options.
const (
- _KVM_CAP_MAX_VCPUS = 0x42
- _KVM_CAP_ARM_VM_IPA_SIZE = 0xa5
+ _KVM_CAP_MAX_MEMSLOTS = 0x0a
+ _KVM_CAP_MAX_VCPUS = 0x42
+ _KVM_CAP_ARM_VM_IPA_SIZE = 0xa5
+ _KVM_CAP_VCPU_EVENTS = 0x29
+ _KVM_CAP_ARM_INJECT_SERROR_ESR = 0x9e
)
// KVM limits.
const (
+ _KVM_NR_MEMSLOTS = 0x100
_KVM_NR_VCPUS = 0xff
_KVM_NR_INTERRUPTS = 0x100
_KVM_NR_CPUID_ENTRIES = 0x100
@@ -71,3 +81,16 @@ const (
_KVM_MEM_READONLY = uint32(1) << 1
_KVM_MEM_FLAGS_NONE = 0
)
+
+// KVM hypercall list.
+//
+// Canonical list of hypercalls supported.
+const (
+ // On amd64, it uses 'HLT' to leave the guest.
+ //
+ // Unlike amd64, arm64 can only use mmio_exit/psci to leave the guest.
+ //
+ // _KVM_HYPERCALL_VMEXIT is only used on arm64 for now.
+ _KVM_HYPERCALL_VMEXIT int = iota
+ _KVM_HYPERCALL_MAX
+)
diff --git a/pkg/sentry/platform/kvm/kvm_const_arm64.go b/pkg/sentry/platform/kvm/kvm_const_arm64.go
index 531ae8b1e..5831b9345 100644
--- a/pkg/sentry/platform/kvm/kvm_const_arm64.go
+++ b/pkg/sentry/platform/kvm/kvm_const_arm64.go
@@ -38,6 +38,8 @@ const (
_KVM_ARM64_REGS_SCTLR_EL1 = 0x603000000013c080
_KVM_ARM64_REGS_CPACR_EL1 = 0x603000000013c082
_KVM_ARM64_REGS_VBAR_EL1 = 0x603000000013c600
+ _KVM_ARM64_REGS_TIMER_CNT = 0x603000000013df1a
+ _KVM_ARM64_REGS_CNTFRQ_EL0 = 0x603000000013df00
)
// Arm64: Architectural Feature Access Control Register EL1.
@@ -72,6 +74,7 @@ const (
_TCR_T0SZ_VA48 = 64 - 48 // VA=48
_TCR_T1SZ_VA48 = 64 - 48 // VA=48
+ _TCR_A1 = 1 << 22
_TCR_ASID16 = 1 << 36
_TCR_TBI0 = 1 << 37
@@ -100,13 +103,20 @@ const (
// Arm64: Memory Attribute Indirection Register EL1.
const (
- _MT_DEVICE_nGnRnE = 0
- _MT_DEVICE_nGnRE = 1
- _MT_DEVICE_GRE = 2
- _MT_NORMAL_NC = 3
- _MT_NORMAL = 4
- _MT_NORMAL_WT = 5
- _MT_EL1_INIT = (0 << _MT_DEVICE_nGnRnE) | (0x4 << _MT_DEVICE_nGnRE * 8) | (0xc << _MT_DEVICE_GRE * 8) | (0x44 << _MT_NORMAL_NC * 8) | (0xff << _MT_NORMAL * 8) | (0xbb << _MT_NORMAL_WT * 8)
+ _MT_DEVICE_nGnRnE = 0
+ _MT_DEVICE_nGnRE = 1
+ _MT_DEVICE_GRE = 2
+ _MT_NORMAL_NC = 3
+ _MT_NORMAL = 4
+ _MT_NORMAL_WT = 5
+ _MT_ATTR_DEVICE_nGnRnE = 0x00
+ _MT_ATTR_DEVICE_nGnRE = 0x04
+ _MT_ATTR_DEVICE_GRE = 0x0c
+ _MT_ATTR_NORMAL_NC = 0x44
+ _MT_ATTR_NORMAL_WT = 0xbb
+ _MT_ATTR_NORMAL = 0xff
+ _MT_ATTR_MASK = 0xff
+ _MT_EL1_INIT = (_MT_ATTR_DEVICE_nGnRnE << (_MT_DEVICE_nGnRnE * 8)) | (_MT_ATTR_DEVICE_nGnRE << (_MT_DEVICE_nGnRE * 8)) | (_MT_ATTR_DEVICE_GRE << (_MT_DEVICE_GRE * 8)) | (_MT_ATTR_NORMAL_NC << (_MT_NORMAL_NC * 8)) | (_MT_ATTR_NORMAL << (_MT_NORMAL * 8)) | (_MT_ATTR_NORMAL_WT << (_MT_NORMAL_WT * 8))
)
const (
@@ -116,6 +126,17 @@ const (
// Arm64: Exception Syndrome Register EL1.
const (
+ _ESR_ELx_EC_SHIFT = 26
+ _ESR_ELx_EC_MASK = 0x3F << _ESR_ELx_EC_SHIFT
+
+ _ESR_ELx_EC_IMP_DEF = 0x1f
+ _ESR_ELx_EC_IABT_LOW = 0x20
+ _ESR_ELx_EC_IABT_CUR = 0x21
+ _ESR_ELx_EC_PC_ALIGN = 0x22
+
+ _ESR_ELx_CM = 1 << 8
+ _ESR_ELx_WNR = 1 << 6
+
_ESR_ELx_FSC = 0x3F
_ESR_SEGV_MAPERR_L0 = 0x4
@@ -131,3 +152,10 @@ const (
_ESR_SEGV_PEMERR_L2 = 0xe
_ESR_SEGV_PEMERR_L3 = 0xf
)
+
+// Arm64: MMIO base address used to dispatch hypercalls.
+const (
+ // On Arm64, the MMIO address must be 64-bit aligned.
+ // Currently, we only need 1 hypercall: hypercall_vmexit.
+ _AARCH64_HYPERCALL_MMIO_SIZE = 1 << 3
+)
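
_MT_EL1_INIT packs one attribute byte per memory type into MAIR_EL1, with the memory-type index selecting the byte position; the rewrite above simply names each attribute byte instead of inlining magic numbers. The encoding is easy to check in isolation:

    package main

    import "fmt"

    // mair packs one attribute byte per index, as MAIR_EL1 expects.
    func mair(attrs []uint8) uint64 {
        var v uint64
        for idx, attr := range attrs {
            v |= uint64(attr) << (uint(idx) * 8)
        }
        return v
    }

    func main() {
        // Index order: nGnRnE, nGnRE, GRE, NORMAL_NC, NORMAL, NORMAL_WT.
        fmt.Printf("%#x\n", mair([]uint8{0x00, 0x04, 0x0c, 0x44, 0xff, 0xbb}))
        // Prints 0xbbff440c0400, matching _MT_EL1_INIT above.
    }
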
diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go
index 6c8f4fa28..e58acc071 100644
--- a/pkg/sentry/platform/kvm/kvm_test.go
+++ b/pkg/sentry/platform/kvm/kvm_test.go
@@ -27,6 +27,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil"
"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
+ ktime "gvisor.dev/gvisor/pkg/sentry/time"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -262,30 +263,6 @@ func TestRegistersFault(t *testing.T) {
})
}
-func TestSegments(t *testing.T) {
- applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
- testutil.SetTestSegments(regs)
- for {
- var si arch.SignalInfo
- if _, err := c.SwitchToUser(ring0.SwitchOpts{
- Registers: regs,
- FloatingPointState: dummyFPState,
- PageTables: pt,
- FullRestore: true,
- }, &si); err == platform.ErrContextInterrupt {
- continue // Retry.
- } else if err != nil {
- t.Errorf("application segment check with full restore got unexpected error: %v", err)
- }
- if err := testutil.CheckTestSegments(regs); err != nil {
- t.Errorf("application segment check with full restore failed: %v", err)
- }
- break // Done.
- }
- return false
- })
-}
-
func TestBounce(t *testing.T) {
applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
go func() {
@@ -435,9 +412,9 @@ func TestWrongVCPU(t *testing.T) {
// Basic test, one then the other.
bluepill(c1)
bluepill(c2)
- if c2.switches == 0 {
+ if c2.guestExits == 0 {
// Don't allow the test to proceed if this fails.
- t.Fatalf("wrong vCPU#2 switches: vCPU1=%+v,vCPU2=%+v", c1, c2)
+ t.Fatalf("wrong vCPU#2 exits: vCPU1=%+v,vCPU2=%+v", c1, c2)
}
// Alternate vCPUs; we expect to need to trigger the
@@ -446,11 +423,11 @@ func TestWrongVCPU(t *testing.T) {
bluepill(c1)
bluepill(c2)
}
- if count := c1.switches; count < 90 {
- t.Errorf("wrong vCPU#1 switches: vCPU1=%+v,vCPU2=%+v", c1, c2)
+ if count := c1.guestExits; count < 90 {
+ t.Errorf("wrong vCPU#1 exits: vCPU1=%+v,vCPU2=%+v", c1, c2)
}
- if count := c2.switches; count < 90 {
- t.Errorf("wrong vCPU#2 switches: vCPU1=%+v,vCPU2=%+v", c1, c2)
+ if count := c2.guestExits; count < 90 {
+ t.Errorf("wrong vCPU#2 exits: vCPU1=%+v,vCPU2=%+v", c1, c2)
}
return false
})
@@ -466,6 +443,22 @@ func TestWrongVCPU(t *testing.T) {
})
}
+func TestRdtsc(t *testing.T) {
+ var i int // Iteration count.
+ kvmTest(t, nil, func(c *vCPU) bool {
+ start := ktime.Rdtsc()
+ bluepill(c)
+ guest := ktime.Rdtsc()
+ redpill()
+ end := ktime.Rdtsc()
+ if start > guest || guest > end {
+ t.Errorf("inconsistent time: start=%d, guest=%d, end=%d", start, guest, end)
+ }
+ i++
+ return i < 100
+ })
+}
+
func BenchmarkApplicationSyscall(b *testing.B) {
var (
i int // Iteration includes machine.Get() / machine.Put().
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index f1afc74dc..f70d761fd 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -25,6 +25,7 @@ import (
"gvisor.dev/gvisor/pkg/procid"
"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
+ ktime "gvisor.dev/gvisor/pkg/sentry/time"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -43,25 +44,31 @@ type machine struct {
// kernel is the set of global structures.
kernel ring0.Kernel
- // mappingCache is used for mapPhysical.
- mappingCache sync.Map
-
// mu protects vCPUs.
mu sync.RWMutex
// available is notified when vCPUs are available.
available sync.Cond
- // vCPUs are the machine vCPUs.
+ // vCPUsByTID are the machine vCPUs.
//
// These are populated dynamically.
- vCPUs map[uint64]*vCPU
+ vCPUsByTID map[uint64]*vCPU
// vCPUsByID are the machine vCPUs, can be indexed by the vCPU's ID.
- vCPUsByID map[int]*vCPU
+ vCPUsByID []*vCPU
// maxVCPUs is the maximum number of vCPUs supported by the machine.
maxVCPUs int
+
+ // maxSlots is the maximum number of memory slots supported by the machine.
+ maxSlots int
+
+ // usedSlots is the set of used physical addresses (sorted).
+ usedSlots []uintptr
+
+ // nextID is the next vCPU ID.
+ nextID uint32
}
const (
@@ -97,8 +104,11 @@ type vCPU struct {
// tid is the last set tid.
tid uint64
- // switches is a count of world switches (informational only).
- switches uint32
+ // userExits is the count of user exits.
+ userExits uint64
+
+ // guestExits is the count of guest to host world switches.
+ guestExits uint64
// faults is a count of world faults (informational only).
faults uint32
@@ -121,6 +131,7 @@ type vCPU struct {
// vCPUArchState is the architecture-specific state.
vCPUArchState
+ // dieState holds state related to vCPU death.
dieState dieState
}
@@ -137,9 +148,8 @@ type dieState struct {
//
// Precondition: mu must be held.
func (m *machine) newVCPU() *vCPU {
- id := len(m.vCPUs)
-
// Create the vCPU.
+ id := int(atomic.AddUint32(&m.nextID, 1) - 1)
fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id))
if errno != 0 {
panic(fmt.Sprintf("error creating new vCPU: %v", errno))
@@ -150,7 +160,7 @@ func (m *machine) newVCPU() *vCPU {
fd: int(fd),
machine: m,
}
- c.CPU.Init(&m.kernel, c)
+ c.CPU.Init(&m.kernel, c.id, c)
m.vCPUsByID[c.id] = c
// Ensure the signal mask is correct.
@@ -176,16 +186,10 @@ func (m *machine) newVCPU() *vCPU {
// newMachine returns a new VM context.
func newMachine(vm int) (*machine, error) {
// Create the machine.
- m := &machine{
- fd: vm,
- vCPUs: make(map[uint64]*vCPU),
- vCPUsByID: make(map[int]*vCPU),
- }
+ m := &machine{fd: vm}
m.available.L = &m.mu
- m.kernel.Init(ring0.KernelOpts{
- PageTables: pagetables.New(newAllocator()),
- })
+ // Pull the maximum vCPUs.
maxVCPUs, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
if errno != 0 {
m.maxVCPUs = _KVM_NR_VCPUS
@@ -193,6 +197,21 @@ func newMachine(vm int) (*machine, error) {
m.maxVCPUs = int(maxVCPUs)
}
log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
+ m.vCPUsByTID = make(map[uint64]*vCPU)
+ m.vCPUsByID = make([]*vCPU, m.maxVCPUs)
+ m.kernel.Init(ring0.KernelOpts{
+ PageTables: pagetables.New(newAllocator()),
+ }, m.maxVCPUs)
+
+ // Pull the maximum slots.
+ maxSlots, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_MEMSLOTS)
+ if errno != 0 {
+ m.maxSlots = _KVM_NR_MEMSLOTS
+ } else {
+ m.maxSlots = int(maxSlots)
+ }
+ log.Debugf("The maximum number of slots is %d.", m.maxSlots)
+ m.usedSlots = make([]uintptr, m.maxSlots)
// Apply the physical mappings. Note that these mappings may point to
// guest physical addresses that are not actually available. These
@@ -205,15 +224,9 @@ func newMachine(vm int) (*machine, error) {
pagetables.MapOpts{AccessType: usermem.AnyAccess},
pr.physical)
- // And keep everything in the upper half.
- m.kernel.PageTables.Map(
- usermem.Addr(ring0.KernelStartAddress|pr.virtual),
- pr.length,
- pagetables.MapOpts{AccessType: usermem.AnyAccess},
- pr.physical)
-
return true // Keep iterating.
})
+ m.mapUpperHalf(m.kernel.PageTables)
var physicalRegionsReadOnly []physicalRegion
var physicalRegionsAvailable []physicalRegion
@@ -270,10 +283,26 @@ func newMachine(vm int) (*machine, error) {
return m, nil
}
+// hasSlot returns true iff the given address is mapped.
+//
+// This must be done via a linear scan.
+//
+//go:nosplit
+func (m *machine) hasSlot(physical uintptr) bool {
+ for i := 0; i < len(m.usedSlots); i++ {
+ if p := atomic.LoadUintptr(&m.usedSlots[i]); p == physical {
+ return true
+ }
+ }
+ return false
+}
+
// mapPhysical checks for the mapping of a physical range, and installs one if
// not available. This attempts to be efficient for calls in the hot path.
//
// This panics on error.
+//
+//go:nosplit
func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalRegion, flags uint32) {
for end := physical + length; physical < end; {
_, physicalStart, length, ok := calculateBluepillFault(physical, phyRegions)
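
usedSlots replaces the old sync.Map mapping cache with a fixed array published by atomic stores, keeping the hot-path lookup (hasSlot) allocation-free and safe for go:nosplit code such as mapPhysical. A standalone sketch of the publish-then-scan idea; reserving zero as the empty marker is an assumption of this sketch:

    package main

    import (
        "fmt"
        "sync/atomic"
    )

    type slotCache struct {
        slots []uintptr // 0 means empty; writers store, readers scan.
        next  uint32
    }

    // add publishes a physical address into the next free slot. The
    // sketch assumes capacity is never exceeded.
    func (c *slotCache) add(physical uintptr) {
        slot := atomic.AddUint32(&c.next, 1) - 1
        atomic.StoreUintptr(&c.slots[slot], physical)
    }

    // has reports whether physical was published, via a linear scan.
    func (c *slotCache) has(physical uintptr) bool {
        for i := range c.slots {
            if atomic.LoadUintptr(&c.slots[i]) == physical {
                return true
            }
        }
        return false
    }

    func main() {
        c := &slotCache{slots: make([]uintptr, 16)}
        c.add(0x1000)
        fmt.Println(c.has(0x1000), c.has(0x2000)) // true false
    }
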
@@ -282,8 +311,8 @@ func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalReg
panic("mapPhysical on unknown physical address")
}
- if _, ok := m.mappingCache.LoadOrStore(physicalStart, true); !ok {
- // Not present in the cache; requires setting the slot.
+ // Is this already mapped? Check the usedSlots.
+ if !m.hasSlot(physicalStart) {
if _, ok := handleBluepillFault(m, physical, phyRegions, flags); !ok {
panic("handleBluepillFault failed")
}
@@ -304,7 +333,11 @@ func (m *machine) Destroy() {
runtime.SetFinalizer(m, nil)
// Destroy vCPUs.
- for _, c := range m.vCPUs {
+ for _, c := range m.vCPUsByID {
+ if c == nil {
+ continue
+ }
+
// Ensure the vCPU is not still running in guest mode. This is
// possible iff teardown has been done by other threads, and
// somehow a single thread has not executed any system calls.
@@ -331,13 +364,18 @@ func (m *machine) Destroy() {
// Get gets an available vCPU.
//
// This will return with the OS thread locked.
+//
+// It is guaranteed that if any OS thread TID is in guest mode, then
+// m.vCPUsByTID[TID] points to the vCPU on which that thread is running. So if
+// Get() returns with the current context in guest mode, its vCPU must be the
+// one that Get() returns.
func (m *machine) Get() *vCPU {
m.mu.RLock()
runtime.LockOSThread()
tid := procid.Current()
// Check for an exact match.
- if c := m.vCPUs[tid]; c != nil {
+ if c := m.vCPUsByTID[tid]; c != nil {
c.lock()
m.mu.RUnlock()
return c
@@ -356,7 +394,7 @@ func (m *machine) Get() *vCPU {
tid = procid.Current()
// Recheck for an exact match.
- if c := m.vCPUs[tid]; c != nil {
+ if c := m.vCPUsByTID[tid]; c != nil {
c.lock()
m.mu.Unlock()
return c
@@ -364,10 +402,10 @@ func (m *machine) Get() *vCPU {
for {
// Scan for an available vCPU.
- for origTID, c := range m.vCPUs {
+ for origTID, c := range m.vCPUsByTID {
if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) {
- delete(m.vCPUs, origTID)
- m.vCPUs[tid] = c
+ delete(m.vCPUsByTID, origTID)
+ m.vCPUsByTID[tid] = c
m.mu.Unlock()
c.loadSegments(tid)
return c
@@ -375,17 +413,17 @@ func (m *machine) Get() *vCPU {
}
// Create a new vCPU (maybe).
- if len(m.vCPUs) < m.maxVCPUs {
+ if int(m.nextID) < m.maxVCPUs {
c := m.newVCPU()
c.lock()
- m.vCPUs[tid] = c
+ m.vCPUsByTID[tid] = c
m.mu.Unlock()
c.loadSegments(tid)
return c
}
// Scan for something not in user mode.
- for origTID, c := range m.vCPUs {
+ for origTID, c := range m.vCPUsByTID {
if !atomic.CompareAndSwapUint32(&c.state, vCPUGuest, vCPUGuest|vCPUWaiter) {
continue
}
@@ -403,8 +441,8 @@ func (m *machine) Get() *vCPU {
}
// Steal the vCPU.
- delete(m.vCPUs, origTID)
- m.vCPUs[tid] = c
+ delete(m.vCPUsByTID, origTID)
+ m.vCPUsByTID[tid] = c
m.mu.Unlock()
c.loadSegments(tid)
return c
@@ -431,7 +469,20 @@ func (m *machine) Put(c *vCPU) {
// newDirtySet returns a new dirty set.
func (m *machine) newDirtySet() *dirtySet {
return &dirtySet{
- vCPUs: make([]uint64, (m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64),
+ vCPUMasks: make([]uint64, (m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64),
+ }
+}
+
+// dropPageTables drops cached page table entries.
+func (m *machine) dropPageTables(pt *pagetables.PageTables) {
+ m.mu.Lock()
+ defer m.mu.Unlock()
+
+ // Clear from all PCIDs.
+ for _, c := range m.vCPUsByID {
+ if c != nil && c.PCIDs != nil {
+ c.PCIDs.Drop(pt)
+ }
}
}
@@ -494,6 +545,8 @@ var pid = syscall.Getpid()
//
// This effectively unwinds the state machine.
func (c *vCPU) bounce(forceGuestExit bool) {
+ origGuestExits := atomic.LoadUint64(&c.guestExits)
+ origUserExits := atomic.LoadUint64(&c.userExits)
for {
switch state := atomic.LoadUint32(&c.state); state {
case vCPUReady, vCPUWaiter:
@@ -549,6 +602,14 @@ func (c *vCPU) bounce(forceGuestExit bool) {
// Should not happen: the above is exhaustive.
panic("invalid state")
}
+
+ // Check if we've missed the state transition, but
+ // we can safely return at this point in time.
+ newGuestExits := atomic.LoadUint64(&c.guestExits)
+ newUserExits := atomic.LoadUint64(&c.userExits)
+ if newUserExits != origUserExits && (!forceGuestExit || newGuestExits != origGuestExits) {
+ return
+ }
}
}
@@ -565,3 +626,35 @@ func (c *vCPU) BounceToKernel() {
func (c *vCPU) BounceToHost() {
c.bounce(true)
}
+
+// setSystemTimeLegacy calibrates and sets an approximate system time.
+func (c *vCPU) setSystemTimeLegacy() error {
+ const minIterations = 10
+ minimum := uint64(0)
+ for iter := 0; ; iter++ {
+ // Try to set the TSC to an estimate of where it will be
+ // on the host during a "fast" system call iteration.
+ start := uint64(ktime.Rdtsc())
+ if err := c.setTSC(start + (minimum / 2)); err != nil {
+ return err
+ }
+ // See if this is our new minimum call time. Note that this
+ // serves two functions: first, we make sure that we are
+ // accurately predicting the offset we need to set. Second, we
+ // don't want to do the final set on a slow call, which could
+ // produce a really bad result.
+ end := uint64(ktime.Rdtsc())
+ if end < start {
+ continue // Totally bogus: unstable TSC?
+ }
+ current := end - start
+ if current < minimum || iter == 0 {
+ minimum = current // Set our new minimum.
+ }
+ // Is this past minIterations and within ~12.5% (1/8th) of the minimum?
+ upperThreshold := (((minimum << 3) + minimum) >> 3)
+ if iter >= minIterations && current <= upperThreshold {
+ return nil
+ }
+ }
+}
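
setSystemTimeLegacy is a classic minimum-latency calibration loop: track the cheapest observed round trip as the latency estimate, and only accept a final attempt that was itself a fast one. The same skeleton with the TSC write stubbed out and wall-clock durations in place of TSC reads:

    package main

    import (
        "fmt"
        "time"
    )

    // calibrate runs op repeatedly, tracking the cheapest observed cost,
    // and returns once a run past minIterations lands within ~12.5% of
    // that minimum, i.e. the final run was itself a "fast" one.
    func calibrate(op func(), minIterations int) time.Duration {
        var minimum time.Duration
        for iter := 0; ; iter++ {
            start := time.Now()
            op()
            current := time.Since(start)
            if iter == 0 || current < minimum {
                minimum = current // New minimum observed.
            }
            if iter >= minIterations && current <= minimum+minimum/8 {
                return current
            }
        }
    }

    func main() {
        d := calibrate(func() { time.Sleep(time.Microsecond) }, 10)
        fmt.Println("final run:", d)
    }
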
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index 923ce3909..a8b729e62 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -18,14 +18,17 @@ package kvm
import (
"fmt"
+ "math/big"
"reflect"
"runtime/debug"
"syscall"
+ "gvisor.dev/gvisor/pkg/cpuid"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
+ ktime "gvisor.dev/gvisor/pkg/sentry/time"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -51,9 +54,10 @@ func (m *machine) initArchState() error {
recover()
debug.SetPanicOnFault(old)
}()
- m.retryInGuest(func() {
- ring0.SetCPUIDFaulting(true)
- })
+ c := m.Get()
+ defer m.Put(c)
+ bluepill(c)
+ ring0.SetCPUIDFaulting(true)
return nil
}
@@ -83,19 +87,6 @@ const (
poolPCIDs = 8
)
-// dropPageTables drops cached page table entries.
-func (m *machine) dropPageTables(pt *pagetables.PageTables) {
- m.mu.Lock()
- defer m.mu.Unlock()
-
- // Clear from all PCIDs.
- for _, c := range m.vCPUs {
- if c.PCIDs != nil {
- c.PCIDs.Drop(pt)
- }
- }
-}
-
// initArchState initializes architecture-specific state.
func (c *vCPU) initArchState() error {
var (
@@ -143,6 +134,7 @@ func (c *vCPU) initArchState() error {
// Set the entrypoint for the kernel.
kernelUserRegs.RIP = uint64(reflect.ValueOf(ring0.Start).Pointer())
kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer())
+ kernelUserRegs.RSP = c.StackTop()
kernelUserRegs.RFLAGS = ring0.KernelFlagsSet
// Set the system registers.
@@ -151,8 +143,8 @@ func (c *vCPU) initArchState() error {
}
// Set the user registers.
- if err := c.setUserRegisters(&kernelUserRegs); err != nil {
- return err
+ if errno := c.setUserRegisters(&kernelUserRegs); errno != 0 {
+ return fmt.Errorf("error setting user registers: %v", errno)
}
// Allocate some floating point state save area for the local vCPU.
@@ -165,6 +157,101 @@ func (c *vCPU) initArchState() error {
return c.setSystemTime()
}
+// bitsForScaling returns the bits available for storing the fraction component
+// of the TSC scaling ratio. This allows us to replicate, in scaledTSC below,
+// the (bad) fixed-point math done by the kernel, and thereby ensure we can
+// compute an exact zero offset in setSystemTime.
+//
+// These constants correspond to kvm_tsc_scaling_ratio_frac_bits.
+var bitsForScaling = func() int64 {
+ fs := cpuid.HostFeatureSet()
+ if fs.Intel() {
+ return 48 // See vmx.c (kvm sources).
+ } else if fs.AMD() {
+ return 32 // See svm.c (svm sources).
+ } else {
+ return 63 // Unknown: theoretical maximum.
+ }
+}()
+
+// scaledTSC returns the host TSC scaled by the given frequency.
+//
+// This assumes a current frequency of 1. We require only the unitless ratio of
+// rawFreq to some current frequency. See setSystemTime for context.
+//
+// The kernel math guarantees that all bits of the multiplication and division
+// will be correctly preserved and applied. However, it is not possible to
+// actually store the ratio correctly. So we need to use the same schema in
+// order to calculate the scaled frequency and get the same result.
+//
+// We can assume that the current frequency is (1), so we are calculating a
+// strict inverse of this value. This simplifies this function considerably.
+//
+// Roughly, the returned value "scaledTSC" will have:
+// scaledTSC/hostTSC == 1/rawFreq
+//
+//go:nosplit
+func scaledTSC(rawFreq uintptr) int64 {
+ scale := int64(1 << bitsForScaling)
+ ratio := big.NewInt(scale / int64(rawFreq))
+ ratio.Mul(ratio, big.NewInt(int64(ktime.Rdtsc())))
+ ratio.Div(ratio, big.NewInt(scale))
+ return ratio.Int64()
+}
+
+// setSystemTime sets the vCPU to the system time.
+func (c *vCPU) setSystemTime() error {
+ // First, scale down the clock frequency to the lowest value allowed by
+ // the API itself. How low we can go depends on the underlying
+ // hardware, but it is typically ~1/2^48 for Intel, ~1/2^32 for AMD.
+ // Even the lower bound here will take a 4GHz frequency down to 1Hz,
+ // meaning that everything should be able to handle a KHz setting of 1
+ // with bits to spare.
+ //
+ // Note that reducing the clock does not typically require special
+ // capabilities as it is emulated in KVM. We don't actually use this
+ // capability, but it means that this method should be robust to
+ // different hardware configurations.
+ rawFreq, err := c.getTSCFreq()
+ if err != nil {
+ return c.setSystemTimeLegacy()
+ }
+ if err := c.setTSCFreq(1); err != nil {
+ return c.setSystemTimeLegacy()
+ }
+
+ // Always restore the original frequency.
+ defer func() {
+ if err := c.setTSCFreq(rawFreq); err != nil {
+ panic(err.Error())
+ }
+ }()
+
+ // Attempt to set the system time in this compressed world. The
+ // calculation for offset normally looks like:
+ //
+ // offset = target_tsc - kvm_scale_tsc(vcpu, rdtsc());
+ //
+ // So as long as the kvm_scale_tsc component is constant before and
+ // after the call to set the TSC value (and it is passes as the
+ // target_tsc), we will compute an offset value of zero.
+ //
+ // This is effectively cheating to make our "setSystemTime" call so
+ // unbelievably, incredibly fast that we do it "instantly" and all the
+ // calculations result in an offset of zero.
+ lastTSC := scaledTSC(rawFreq)
+ for {
+ if err := c.setTSC(uint64(lastTSC)); err != nil {
+ return err
+ }
+ nextTSC := scaledTSC(rawFreq)
+ if lastTSC == nextTSC {
+ return nil
+ }
+ lastTSC = nextTSC // Try again.
+ }
+}
+
// nonCanonical generates a canonical address return.
//
//go:nosplit
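
The fixed-point trick can be exercised in isolation: once the guest frequency is pinned to 1 KHz, the kernel-style scaling is a pure function of the host TSC, so repeatedly writing scaledTSC(rawFreq) converges as soon as two consecutive computations agree, at which point the kernel computes an exact zero offset. A toy version with a fake, slowly advancing TSC:

    package main

    import (
        "fmt"
        "math/big"
    )

    const fracBits = 48 // kvm_tsc_scaling_ratio_frac_bits on Intel.

    var hostTSC = int64(1 << 30) // Fake host counter.

    // rdtsc advances a little on every read, like a real TSC.
    func rdtsc() int64 { hostTSC += 3; return hostTSC }

    // scaled mirrors scaledTSC above:
    // floor(floor(scale/rawFreq) * tsc / scale).
    func scaled(rawFreq int64) int64 {
        scale := int64(1) << fracBits
        r := big.NewInt(scale / rawFreq)
        r.Mul(r, big.NewInt(rdtsc()))
        r.Div(r, big.NewInt(scale))
        return r.Int64()
    }

    func main() {
        const rawFreq = 3_000_000 // KHz, i.e. a 3GHz host.
        last := scaled(rawFreq)
        for i := 0; ; i++ {
            next := scaled(rawFreq)
            if next == last {
                fmt.Printf("converged after %d retries: scaled tsc = %d\n", i, next)
                return
            }
            last = next
        }
    }
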
@@ -335,29 +422,6 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo)
}
}
-// retryInGuest runs the given function in guest mode.
-//
-// If the function does not complete in guest mode (due to execution of a
-// system call due to a GC stall, for example), then it will be retried. The
-// given function must be idempotent as a result of the retry mechanism.
-func (m *machine) retryInGuest(fn func()) {
- c := m.Get()
- defer m.Put(c)
- for {
- c.ClearErrorCode() // See below.
- bluepill(c) // Force guest mode.
- fn() // Execute the given function.
- _, user := c.ErrorCode()
- if user {
- // If user is set, then we haven't bailed back to host
- // mode via a kernel exception or system call. We
- // consider the full function to have executed in guest
- // mode and we can return.
- break
- }
- }
-}
-
// On x86 platform, the flags for "setMemoryRegion" can always be set as 0.
// There is no need to return read-only physicalRegions.
func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
@@ -367,3 +431,41 @@ func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
func availableRegionsForSetMem() (phyRegions []physicalRegion) {
return physicalRegions
}
+
+var execRegions = func() (regions []region) {
+ applyVirtualRegions(func(vr virtualRegion) {
+ if excludeVirtualRegion(vr) || vr.filename == "[vsyscall]" {
+ return
+ }
+ if vr.accessType.Execute {
+ regions = append(regions, vr.region)
+ }
+ })
+ return
+}()
+
+func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
+ for _, r := range execRegions {
+ physical, length, ok := translateToPhysical(r.virtual)
+ if !ok || length < r.length {
+ panic("impossilbe translation")
+ }
+ pageTable.Map(
+ usermem.Addr(ring0.KernelStartAddress|r.virtual),
+ r.length,
+ pagetables.MapOpts{AccessType: usermem.Execute},
+ physical)
+ }
+ for start, end := range m.kernel.EntryRegions() {
+ regionLen := end - start
+ physical, length, ok := translateToPhysical(start)
+ if !ok || length < regionLen {
+ panic("impossible translation")
+ }
+ pageTable.Map(
+ usermem.Addr(ring0.KernelStartAddress|start),
+ regionLen,
+ pagetables.MapOpts{AccessType: usermem.ReadWrite},
+ physical)
+ }
+}
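
mapUpperHalf aliases the sentry's executable regions and the kernel entry regions into the upper half of the guest address space by OR-ing in ring0.KernelStartAddress. The address construction alone, with a hypothetical 48-bit-VA upper-half base (the real constant comes from ring0):

    package main

    import "fmt"

    // kernelStartAddress is a made-up upper-half base for a 48-bit VA
    // space; the real value is ring0.KernelStartAddress.
    const kernelStartAddress uintptr = 0xffff_8000_0000_0000

    // upperHalf mirrors the address construction in mapUpperHalf: the
    // same physical pages become visible at KernelStartAddress|VA.
    func upperHalf(virtual uintptr) uintptr {
        return kernelStartAddress | virtual
    }

    func main() {
        fmt.Printf("%#x -> %#x\n", uintptr(0x400000), upperHalf(0x400000))
    }
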
diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
index 7156c245f..b430f92c6 100644
--- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
@@ -23,7 +23,6 @@ import (
"unsafe"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/time"
)
// loadSegments copies the current segments.
@@ -61,91 +60,63 @@ func (c *vCPU) setCPUID() error {
return nil
}
-// setSystemTime sets the TSC for the vCPU.
+// getTSCFreq gets the TSC frequency.
//
-// This has to make the call many times in order to minimize the intrinsic
-// error in the offset. Unfortunately KVM does not expose a relative offset via
-// the API, so this is an approximation. We do this via an iterative algorithm.
-// This has the advantage that it can generally deal with highly variable
-// system call times and should converge on the correct offset.
-func (c *vCPU) setSystemTime() error {
- const (
- _MSR_IA32_TSC = 0x00000010
- calibrateTries = 10
- )
- registers := modelControlRegisters{
- nmsrs: 1,
- }
- registers.entries[0] = modelControlRegister{
- index: _MSR_IA32_TSC,
+// It returns the TSC frequency in KHz, or the ioctl errno on failure.
+func (c *vCPU) getTSCFreq() (uintptr, error) {
+ rawFreq, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_GET_TSC_KHZ,
+ 0 /* ignored */)
+ if errno != 0 {
+ return 0, errno
}
- target := uint64(^uint32(0))
- for done := 0; done < calibrateTries; {
- start := uint64(time.Rdtsc())
- registers.entries[0].data = start + target
- if _, _, errno := syscall.RawSyscall(
- syscall.SYS_IOCTL,
- uintptr(c.fd),
- _KVM_SET_MSRS,
- uintptr(unsafe.Pointer(&registers))); errno != 0 {
- return fmt.Errorf("error setting system time: %v", errno)
- }
- // See if this is our new minimum call time. Note that this
- // serves two functions: one, we make sure that we are
- // accurately predicting the offset we need to set. Second, we
- // don't want to do the final set on a slow call, which could
- // produce a really bad result. So we only count attempts
- // within +/- 6.25% of our minimum as an attempt.
- end := uint64(time.Rdtsc())
- if end < start {
- continue // Totally bogus.
- }
- half := (end - start) / 2
- if half < target {
- target = half
- }
- if (half - target) < target/8 {
- done++
- }
+ return rawFreq, nil
+}
+
+// setTSCFreq sets the TSC frequency.
+func (c *vCPU) setTSCFreq(freq uintptr) error {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_SET_TSC_KHZ,
+ freq /* khz */); errno != 0 {
+ return fmt.Errorf("error setting TSC frequency: %v", errno)
}
return nil
}
-// setSignalMask sets the vCPU signal mask.
-//
-// This must be called prior to running the vCPU.
-func (c *vCPU) setSignalMask() error {
- // The layout of this structure implies that it will not necessarily be
- // the same layout chosen by the Go compiler. It gets fudged here.
- var data struct {
- length uint32
- mask1 uint32
- mask2 uint32
- _ uint32
+// setTSC sets the TSC value.
+func (c *vCPU) setTSC(value uint64) error {
+ const _MSR_IA32_TSC = 0x00000010
+ registers := modelControlRegisters{
+ nmsrs: 1,
}
- data.length = 8 // Fixed sigset size.
- data.mask1 = ^uint32(bounceSignalMask & 0xffffffff)
- data.mask2 = ^uint32(bounceSignalMask >> 32)
+ registers.entries[0].index = _MSR_IA32_TSC
+ registers.entries[0].data = value
if _, _, errno := syscall.RawSyscall(
syscall.SYS_IOCTL,
uintptr(c.fd),
- _KVM_SET_SIGNAL_MASK,
- uintptr(unsafe.Pointer(&data))); errno != 0 {
- return fmt.Errorf("error setting signal mask: %v", errno)
+ _KVM_SET_MSRS,
+ uintptr(unsafe.Pointer(&registers))); errno != 0 {
+ return fmt.Errorf("error setting tsc: %v", errno)
}
return nil
}
// setUserRegisters sets user registers in the vCPU.
-func (c *vCPU) setUserRegisters(uregs *userRegs) error {
+//
+//go:nosplit
+func (c *vCPU) setUserRegisters(uregs *userRegs) syscall.Errno {
if _, _, errno := syscall.RawSyscall(
syscall.SYS_IOCTL,
uintptr(c.fd),
_KVM_SET_REGS,
uintptr(unsafe.Pointer(uregs))); errno != 0 {
- return fmt.Errorf("error setting user registers: %v", errno)
+ return errno
}
- return nil
+ return 0
}
// getUserRegisters reloads user registers in the vCPU.
@@ -154,7 +125,7 @@ func (c *vCPU) setUserRegisters(uregs *userRegs) error {
//
//go:nosplit
func (c *vCPU) getUserRegisters(uregs *userRegs) syscall.Errno {
- if _, _, errno := syscall.RawSyscall(
+ if _, _, errno := syscall.RawSyscall( // escapes: no.
syscall.SYS_IOCTL,
uintptr(c.fd),
_KVM_GET_REGS,
@@ -175,3 +146,17 @@ func (c *vCPU) setSystemRegisters(sregs *systemRegs) error {
}
return nil
}
+
+// getSystemRegisters gets system registers.
+//
+//go:nosplit
+func (c *vCPU) getSystemRegisters(sregs *systemRegs) syscall.Errno {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_GET_SREGS,
+ uintptr(unsafe.Pointer(sregs))); errno != 0 {
+ return errno
+ }
+ return 0
+}
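
The three helpers above replace the old iterative TSC calibration: rather
than repeatedly writing the TSC MSR and measuring call latency, the vCPU's
TSC frequency can be pinned to the host's and the counter seeded once. A
hedged sketch of how a caller might combine them; the interface, seedGuestTSC,
and the rdtsc parameter are illustrative, not the actual gvisor code.

package kvmsketch

import "fmt"

// tscVCPU abstracts the three helpers introduced above; the concrete
// *vCPU in machine_amd64_unsafe.go has methods of this shape.
type tscVCPU interface {
	getTSCFreq() (uintptr, error)
	setTSCFreq(freq uintptr) error
	setTSC(value uint64) error
}

// seedGuestTSC matches the guest TSC frequency to the host's, then
// seeds the counter so host and guest clocks agree up to the latency
// of the final ioctl.
func seedGuestTSC(c tscVCPU, hostFreqKHz uintptr, rdtsc func() uint64) error {
	freq, err := c.getTSCFreq()
	if err != nil {
		return fmt.Errorf("reading guest TSC frequency: %w", err)
	}
	if freq != hostFreqKHz {
		if err := c.setTSCFreq(hostFreqKHz); err != nil {
			return err
		}
	}
	return c.setTSC(rdtsc())
}
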
diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go
index 750751aa3..54837f20c 100644
--- a/pkg/sentry/platform/kvm/machine_arm64.go
+++ b/pkg/sentry/platform/kvm/machine_arm64.go
@@ -19,6 +19,7 @@ package kvm
import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sentry/platform/ring0"
"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -48,6 +49,18 @@ const (
poolPCIDs = 8
)
+func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
+ applyPhysicalRegions(func(pr physicalRegion) bool {
+ pageTable.Map(
+ usermem.Addr(ring0.KernelStartAddress|pr.virtual),
+ pr.length,
+ pagetables.MapOpts{AccessType: usermem.AnyAccess},
+ pr.physical)
+
+ return true // Keep iterating.
+ })
+}
+
// Get all read-only physicalRegions.
func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
var rdonlyRegions []region
@@ -100,19 +113,6 @@ func availableRegionsForSetMem() (phyRegions []physicalRegion) {
return phyRegions
}
-// dropPageTables drops cached page table entries.
-func (m *machine) dropPageTables(pt *pagetables.PageTables) {
- m.mu.Lock()
- defer m.mu.Unlock()
-
- // Clear from all PCIDs.
- for _, c := range m.vCPUs {
- if c.PCIDs != nil {
- c.PCIDs.Drop(pt)
- }
- }
-}
-
// nonCanonical generates a canonical address return.
//
//go:nosplit
@@ -125,71 +125,59 @@ func nonCanonical(addr uint64, signal int32, info *arch.SignalInfo) (usermem.Acc
return usermem.NoAccess, platform.ErrContextSignal
}
+// isInstructionAbort returns true if it is an instruction abort.
+//
+//go:nosplit
+func isInstructionAbort(code uint64) bool {
+ value := (code & _ESR_ELx_EC_MASK) >> _ESR_ELx_EC_SHIFT
+ return value == _ESR_ELx_EC_IABT_LOW
+}
+
+// isWriteFault returns whether it is a write fault.
+//
+//go:nosplit
+func isWriteFault(code uint64) bool {
+ if isInstructionAbort(code) {
+ return false
+ }
+
+ return (code & _ESR_ELx_WNR) != 0
+}
+
// fault generates an appropriate fault return.
//
//go:nosplit
func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (usermem.AccessType, error) {
+ bluepill(c) // Probably no-op, but may not be.
faultAddr := c.GetFaultAddr()
code, user := c.ErrorCode()
+ if !user {
+ // The last fault serviced by this CPU was not a user
+ // fault, so we can't reliably trust the faultAddr or
+ // the code provided here. We need to re-execute.
+ return usermem.NoAccess, platform.ErrContextInterrupt
+ }
+
// Reset the pointed SignalInfo.
*info = arch.SignalInfo{Signo: signal}
info.SetAddr(uint64(faultAddr))
- read := true
- write := false
- execute := true
-
ret := code & _ESR_ELx_FSC
switch ret {
case _ESR_SEGV_MAPERR_L0, _ESR_SEGV_MAPERR_L1, _ESR_SEGV_MAPERR_L2, _ESR_SEGV_MAPERR_L3:
info.Code = 1 //SEGV_MAPERR
- read = false
- write = true
- execute = false
case _ESR_SEGV_ACCERR_L1, _ESR_SEGV_ACCERR_L2, _ESR_SEGV_ACCERR_L3, _ESR_SEGV_PEMERR_L1, _ESR_SEGV_PEMERR_L2, _ESR_SEGV_PEMERR_L3:
info.Code = 2 // SEGV_ACCERR.
- read = true
- write = false
- execute = false
default:
info.Code = 2
}
- if !user {
- read = true
- write = false
- execute = true
-
- }
accessType := usermem.AccessType{
- Read: read,
- Write: write,
- Execute: execute,
+ Read: !isWriteFault(uint64(code)),
+ Write: isWriteFault(uint64(code)),
+ Execute: isInstructionAbort(uint64(code)),
}
return accessType, platform.ErrContextSignal
}
-
-// retryInGuest runs the given function in guest mode.
-//
-// If the function does not complete in guest mode (due to execution of a
-// system call due to a GC stall, for example), then it will be retried. The
-// given function must be idempotent as a result of the retry mechanism.
-func (m *machine) retryInGuest(fn func()) {
- c := m.Get()
- defer m.Put(c)
- for {
- c.ClearErrorCode() // See below.
- bluepill(c) // Force guest mode.
- fn() // Execute the given function.
- _, user := c.ErrorCode()
- if user {
- // If user is set, then we haven't bailed back to host
- // mode via a kernel exception or system call. We
- // consider the full function to have executed in guest
- // mode and we can return.
- break
- }
- }
-}
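
The new isInstructionAbort/isWriteFault helpers classify a fault purely from
the ESR_ELx syndrome value. A standalone sketch of that decoding; the
constants follow the ARMv8 encoding (exception class in bits [31:26], WnR in
bit 6) and mirror the _ESR_ELx_* values the helpers above assume.

package esrsketch

// ESR_ELx fields per the ARMv8 architecture.
const (
	_ESR_ELx_EC_SHIFT    = 26
	_ESR_ELx_EC_MASK     = uint64(0x3f) << _ESR_ELx_EC_SHIFT
	_ESR_ELx_EC_IABT_LOW = 0x20   // Instruction abort from a lower EL.
	_ESR_ELx_WNR         = 1 << 6 // Write-not-Read, for data aborts.
)

// decodeAccess mirrors the helpers above: an instruction abort is an
// execute fault; otherwise the WnR bit distinguishes writes from reads.
func decodeAccess(esr uint64) (read, write, execute bool) {
	if (esr&_ESR_ELx_EC_MASK)>>_ESR_ELx_EC_SHIFT == _ESR_ELx_EC_IABT_LOW {
		return false, false, true
	}
	write = esr&_ESR_ELx_WNR != 0
	return !write, write, false
}
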
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index 3c02cef7c..1344ed3c9 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -26,6 +26,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
+ "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -60,7 +61,6 @@ func (c *vCPU) initArchState() error {
reg.addr = uint64(reflect.ValueOf(&data).Pointer())
regGet.addr = uint64(reflect.ValueOf(&dataGet).Pointer())
- vcpuInit.target = _KVM_ARM_TARGET_GENERIC_V8
vcpuInit.features[0] |= (1 << _KVM_ARM_VCPU_PSCI_0_2)
if _, _, errno := syscall.RawSyscall(
syscall.SYS_IOCTL,
@@ -78,21 +78,8 @@ func (c *vCPU) initArchState() error {
return err
}
- // sctlr_el1
- regGet.id = _KVM_ARM64_REGS_SCTLR_EL1
- if err := c.getOneRegister(&regGet); err != nil {
- return err
- }
-
- dataGet |= (_SCTLR_M | _SCTLR_C | _SCTLR_I)
- data = dataGet
- reg.id = _KVM_ARM64_REGS_SCTLR_EL1
- if err := c.setOneRegister(&reg); err != nil {
- return err
- }
-
// tcr_el1
- data = _TCR_TXSZ_VA48 | _TCR_CACHE_FLAGS | _TCR_SHARED | _TCR_TG_FLAGS | _TCR_ASID16 | _TCR_IPS_40BITS
+ data = _TCR_TXSZ_VA48 | _TCR_CACHE_FLAGS | _TCR_SHARED | _TCR_TG_FLAGS | _TCR_ASID16 | _TCR_IPS_40BITS | _TCR_A1
reg.id = _KVM_ARM64_REGS_TCR_EL1
if err := c.setOneRegister(&reg); err != nil {
return err
@@ -116,7 +103,7 @@ func (c *vCPU) initArchState() error {
c.SetTtbr0Kvm(uintptr(data))
// ttbr1_el1
- data = c.machine.kernel.PageTables.TTBR1_EL1(false, 0)
+ data = c.machine.kernel.PageTables.TTBR1_EL1(false, 1)
reg.id = _KVM_ARM64_REGS_TTBR1_EL1
if err := c.setOneRegister(&reg); err != nil {
@@ -159,16 +146,46 @@ func (c *vCPU) initArchState() error {
return err
}
- data = ring0.PsrDefaultSet | ring0.KernelFlagsSet
- reg.id = _KVM_ARM64_REGS_PSTATE
+ // Use the address of the exception vector table as
+ // the MMIO address base.
+ arm64HypercallMMIOBase = toLocation
+
+ // Initialize the PCID database.
+ if hasGuestPCID {
+ // Note that NewPCIDs may return a nil table here, in which
+ // case we simply don't use PCID support (see below). In
+ // practice, this should not happen, however.
+ c.PCIDs = pagetables.NewPCIDs(fixedKernelPCID+1, poolPCIDs)
+ }
+
+ c.floatingPointState = arch.NewFloatingPointData()
+
+ return c.setSystemTime()
+}
+
+// setTSC sets the counter Virtual Offset.
+func (c *vCPU) setTSC(value uint64) error {
+ var (
+ reg kvmOneReg
+ data uint64
+ )
+
+ reg.addr = uint64(reflect.ValueOf(&data).Pointer())
+ reg.id = _KVM_ARM64_REGS_TIMER_CNT
+ data = uint64(value)
+
if err := c.setOneRegister(&reg); err != nil {
return err
}
- c.floatingPointState = arch.NewFloatingPointData()
return nil
}
+// setSystemTime sets the system time for the vCPU.
+func (c *vCPU) setSystemTime() error {
+ return c.setSystemTimeLegacy()
+}
+
//go:nosplit
func (c *vCPU) loadSegments(tid uint64) {
// TODO(gvisor.dev/issue/1238): TLS is not supported.
@@ -198,42 +215,6 @@ func (c *vCPU) getOneRegister(reg *kvmOneReg) error {
return nil
}
-// setCPUID sets the CPUID to be used by the guest.
-func (c *vCPU) setCPUID() error {
- return nil
-}
-
-// setSystemTime sets the TSC for the vCPU.
-func (c *vCPU) setSystemTime() error {
- return nil
-}
-
-// setSignalMask sets the vCPU signal mask.
-//
-// This must be called prior to running the vCPU.
-func (c *vCPU) setSignalMask() error {
- // The layout of this structure implies that it will not necessarily be
- // the same layout chosen by the Go compiler. It gets fudged here.
- var data struct {
- length uint32
- mask1 uint32
- mask2 uint32
- _ uint32
- }
- data.length = 8 // Fixed sigset size.
- data.mask1 = ^uint32(bounceSignalMask & 0xffffffff)
- data.mask2 = ^uint32(bounceSignalMask >> 32)
- if _, _, errno := syscall.RawSyscall(
- syscall.SYS_IOCTL,
- uintptr(c.fd),
- _KVM_SET_SIGNAL_MASK,
- uintptr(unsafe.Pointer(&data))); errno != 0 {
- return fmt.Errorf("error setting signal mask: %v", errno)
- }
-
- return nil
-}
-
// SwitchToUser unpacks architectural-details.
func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) (usermem.AccessType, error) {
// Check for canonical addresses.
@@ -243,6 +224,13 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo)
return nonCanonical(regs.Sp, int32(syscall.SIGBUS), info)
}
+ // Assign PCIDs.
+ if c.PCIDs != nil {
+ var requireFlushPCID bool // Force a flush?
+ switchOpts.UserASID, requireFlushPCID = c.PCIDs.Assign(switchOpts.PageTables)
+ switchOpts.Flush = switchOpts.Flush || requireFlushPCID
+ }
+
var vector ring0.Vector
ttbr0App := switchOpts.PageTables.TTBR0_EL1(false, 0)
c.SetTtbr0App(uintptr(ttbr0App))
@@ -269,10 +257,19 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo)
case ring0.PageFault:
return c.fault(int32(syscall.SIGSEGV), info)
- case 0xaa:
- return usermem.NoAccess, nil
+ case ring0.Vector(bounce): // ring0.VirtualizationException
+ return usermem.NoAccess, platform.ErrContextInterrupt
+ case ring0.El0Sync_undef:
+ return c.fault(int32(syscall.SIGILL), info)
+ case ring0.El1Sync_undef:
+ *info = arch.SignalInfo{
+ Signo: int32(syscall.SIGILL),
+ Code: 1, // ILL_ILLOPC (illegal opcode).
+ }
+ info.SetAddr(switchOpts.Registers.Pc) // Include address.
+ return usermem.AccessType{}, platform.ErrContextSignal
default:
- return usermem.NoAccess, platform.ErrContextSignal
+ panic(fmt.Sprintf("unexpected vector: 0x%x", vector))
}
}
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go
index de7df4f80..1d6ca245a 100644
--- a/pkg/sentry/platform/kvm/machine_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_unsafe.go
@@ -13,7 +13,7 @@
// limitations under the License.
// +build go1.12
-// +build !go1.16
+// +build !go1.17
// Check go:linkname function signatures when updating Go version.
@@ -115,7 +115,7 @@ func (a *atomicAddressSpace) get() *addressSpace {
//
//go:nosplit
func (c *vCPU) notify() {
- _, _, errno := syscall.RawSyscall6(
+ _, _, errno := syscall.RawSyscall6( // escapes: no.
syscall.SYS_FUTEX,
uintptr(unsafe.Pointer(&c.state)),
linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG,
@@ -143,3 +143,29 @@ func (c *vCPU) waitUntilNot(state uint32) {
panic("futex wait error")
}
}
+
+// setSignalMask sets the vCPU signal mask.
+//
+// This must be called prior to running the vCPU.
+func (c *vCPU) setSignalMask() error {
+ // The layout of this structure implies that it will not necessarily be
+ // the same layout chosen by the Go compiler. It gets fudged here.
+ var data struct {
+ length uint32
+ mask1 uint32
+ mask2 uint32
+ _ uint32
+ }
+ data.length = 8 // Fixed sigset size.
+ data.mask1 = ^uint32(bounceSignalMask & 0xffffffff)
+ data.mask2 = ^uint32(bounceSignalMask >> 32)
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_SET_SIGNAL_MASK,
+ uintptr(unsafe.Pointer(&data))); errno != 0 {
+ return fmt.Errorf("error setting signal mask: %v", errno)
+ }
+
+ return nil
+}
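
The hand-laid-out struct above exists because KVM_SET_SIGNAL_MASK takes
struct kvm_signal_mask { __u32 len; __u8 sigset[]; }, whose flexible array
member Go cannot express directly. A hypothetical sketch of building such a
mask; maskAllExcept is illustrative, while the real code derives the mask
from bounceSignalMask.

package masksketch

// kvmSignalMask lays out the KVM_SET_SIGNAL_MASK argument by hand: a
// 32-bit length followed by a 64-bit sigset split into two 32-bit
// words, as in setSignalMask above.
type kvmSignalMask struct {
	length uint32
	mask1  uint32
	mask2  uint32
	_      uint32
}

// maskAllExcept blocks every signal except those listed; signal
// numbers are 1-based, matching the kernel ABI.
func maskAllExcept(allowed ...uint) kvmSignalMask {
	var keep uint64
	for _, sig := range allowed {
		keep |= 1 << (sig - 1)
	}
	return kvmSignalMask{
		length: 8, // Fixed sigset size, as above.
		mask1:  ^uint32(keep & 0xffffffff),
		mask2:  ^uint32(keep >> 32),
	}
}
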
diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go
index ca902c8c1..c5235ca9d 100644
--- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go
+++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go
@@ -23,6 +23,11 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
)
+// TLSWorks checks that TLS (TPIDR_EL0) is preserved across a system call.
+//
+// It returns true on success.
+func TLSWorks() bool
+
// SetTestTarget sets the rip appropriately.
func SetTestTarget(regs *arch.Registers, fn func()) {
regs.Pc = uint64(reflect.ValueOf(fn).Pointer())
@@ -56,5 +61,9 @@ func CheckTestRegs(regs *arch.Registers, full bool) (err error) {
err = addRegisterMismatch(err, fmt.Sprintf("R%d", i), regs.Regs[i], need)
}
}
+ // Check tls.
+ if need := ^uint64(11); regs.TPIDR_EL0 != need {
+ err = addRegisterMismatch(err, "tpdir_el0", regs.TPIDR_EL0, need)
+ }
return
}
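
A hypothetical sketch of how the new TLSWorks helper might be consumed from a
platform test; the testTLS wrapper is illustrative and not part of this
change.

package kvm_test

import "testing"

// testTLS asserts that TPIDR_EL0 survives a guest system call:
// TLSWorks (implemented in assembly) writes a sentinel, issues a
// syscall, and reports whether the sentinel is still in place.
func testTLS(t *testing.T, tlsWorks func() bool) {
	if !tlsWorks() {
		t.Fatalf("TPIDR_EL0 was not preserved across a system call")
	}
}
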
diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s
index 0bebee852..7348c29a5 100644
--- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s
+++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s
@@ -50,8 +50,26 @@ TEXT ·SpinLoop(SB),NOSPLIT,$0
start:
B start
+TEXT ·TLSWorks(SB),NOSPLIT,$0-8
+ NO_LOCAL_POINTERS
+ MOVD $0x6789, R5
+ MSR R5, TPIDR_EL0
+ MOVD $SYS_GETPID, R8 // getpid
+ SVC
+ MRS TPIDR_EL0, R6
+ CMP R5, R6
+ BNE tlsFail
+ MOVD $1, R0
+ MOVD R0, ret+0(FP)
+ RET
+tlsFail:
+ MOVD $0, ret+0(FP)
+ RET
+
TEXT ·FloatingPointWorks(SB),NOSPLIT,$0-8
NO_LOCAL_POINTERS
+ // The Go GC may touch fpsimd registers (e.g. in
+ // runtime.deductSweepCredit), so we should test that they are
+ // preserved.
FMOVD $(9.9), F0
MOVD $SYS_GETPID, R8 // getpid
SVC
@@ -102,5 +120,15 @@ isNaN:
TEXT ·TwiddleRegsSyscall(SB),NOSPLIT,$0
TWIDDLE_REGS()
+ MSR R10, TPIDR_EL0
+ // Trapped in el0_svc.
SVC
RET // never reached
+
+TEXT ·TwiddleRegsFault(SB),NOSPLIT,$0
+ TWIDDLE_REGS()
+ MSR R10, TPIDR_EL0
+ // Trapped in el0_ia.
+ // BR (Branch to Register) branches unconditionally to the address in <Rn>.
+ JMP (R6) // <=> br x6, must fault
+ RET // never reached
diff --git a/pkg/sentry/platform/kvm/virtual_map.go b/pkg/sentry/platform/kvm/virtual_map.go
index c8897d34f..4dcdbf8a7 100644
--- a/pkg/sentry/platform/kvm/virtual_map.go
+++ b/pkg/sentry/platform/kvm/virtual_map.go
@@ -34,7 +34,7 @@ type virtualRegion struct {
}
// mapsLine matches a single line from /proc/PID/maps.
-var mapsLine = regexp.MustCompile("([0-9a-f]+)-([0-9a-f]+) ([r-][w-][x-][sp]) ([0-9a-f]+) [0-9a-f]{2}:[0-9a-f]{2,} [0-9]+\\s+(.*)")
+var mapsLine = regexp.MustCompile("([0-9a-f]+)-([0-9a-f]+) ([r-][w-][x-][sp]) ([0-9a-f]+) [0-9a-f]{2,3}:[0-9a-f]{2,} [0-9]+\\s+(.*)")
// excludeRegion returns true if these regions should be excluded from the
// physical map. Virtual regions need to be excluded if get_user_pages will
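
The regex change above widens the device-major field from exactly two hex
digits to two or three, since majors above 0xff are printed with three digits
in /proc/PID/maps. A self-contained sketch exercising the updated pattern;
the sample line is fabricated for illustration.

package mapsketch

import (
	"fmt"
	"regexp"
)

// mapsLine matches one /proc/PID/maps line, accepting 2-3 hex digits
// for the device major, as in the updated pattern above.
var mapsLine = regexp.MustCompile(`([0-9a-f]+)-([0-9a-f]+) ([r-][w-][x-][sp]) ([0-9a-f]+) [0-9a-f]{2,3}:[0-9a-f]{2,} [0-9]+\s+(.*)`)

func ExampleParse() {
	line := "7f1c00000000-7f1c00021000 r-xp 00000000 103:02 1048578    /usr/bin/example"
	m := mapsLine.FindStringSubmatch(line)
	if m == nil {
		fmt.Println("no match")
		return
	}
	fmt.Printf("start=%s end=%s perms=%s path=%s\n", m[1], m[2], m[3], m[5])
}
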
diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go
index 171513f3f..dcfe839a7 100644
--- a/pkg/sentry/platform/platform.go
+++ b/pkg/sentry/platform/platform.go
@@ -22,9 +22,11 @@ import (
"os"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/safemem"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/seccomp"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/hostmm"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -51,6 +53,10 @@ type Platform interface {
// can reliably return ErrContextCPUPreempted.
DetectsCPUPreemption() bool
+ // HaveGlobalMemoryBarrier returns true if the GlobalMemoryBarrier method
+ // is supported.
+ HaveGlobalMemoryBarrier() bool
+
// MapUnit returns the alignment used for optional mappings into this
// platform's AddressSpaces. Higher values indicate lower per-page costs
// for AddressSpace.MapFile. As a special case, a MapUnit of 0 indicates
@@ -96,6 +102,15 @@ type Platform interface {
// called.
PreemptAllCPUs() error
+ // GlobalMemoryBarrier blocks until all threads running application code
+ // (via Context.Switch) and all task goroutines "have passed through a
+ // state where all memory accesses to user-space addresses match program
+ // order between entry to and return from [GlobalMemoryBarrier]", as for
+ // membarrier(2).
+ //
+ // Preconditions: HaveGlobalMemoryBarrier() == true.
+ GlobalMemoryBarrier() error
+
// SyscallFilters returns syscalls made exclusively by this platform.
SyscallFilters() seccomp.SyscallRules
}
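
GlobalMemoryBarrier corresponds to membarrier(2) on the host. A hedged sketch
of the host calls an implementation like hostmm presumably makes; the syscall
number is for x86-64 and the command values come from <linux/membarrier.h>,
so treat both as assumptions here.

package membarriersketch

import "syscall"

const (
	sysMembarrier       = 324    // membarrier(2) number on x86-64 (assumption).
	membarrierCmdQuery  = 0      // MEMBARRIER_CMD_QUERY.
	membarrierCmdGlobal = 1 << 0 // MEMBARRIER_CMD_GLOBAL.
)

// haveGlobalMemoryBarrier queries the kernel for MEMBARRIER_CMD_GLOBAL
// support, roughly what HaveGlobalMemoryBarrier() must check.
func haveGlobalMemoryBarrier() bool {
	supported, _, errno := syscall.Syscall(sysMembarrier, membarrierCmdQuery, 0, 0)
	return errno == 0 && supported&membarrierCmdGlobal != 0
}

// globalMemoryBarrier forces every running thread on the system
// through a full memory barrier, per membarrier(2).
func globalMemoryBarrier() error {
	if _, _, errno := syscall.Syscall(sysMembarrier, membarrierCmdGlobal, 0, 0); errno != 0 {
		return errno
	}
	return nil
}
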
@@ -114,6 +129,54 @@ func (NoCPUPreemptionDetection) PreemptAllCPUs() error {
panic("This platform does not support CPU preemption detection")
}
+// UseHostGlobalMemoryBarrier implements Platform.HaveGlobalMemoryBarrier and
+// Platform.GlobalMemoryBarrier by invoking equivalent functionality on the
+// host.
+type UseHostGlobalMemoryBarrier struct{}
+
+// HaveGlobalMemoryBarrier implements Platform.HaveGlobalMemoryBarrier.
+func (UseHostGlobalMemoryBarrier) HaveGlobalMemoryBarrier() bool {
+ return hostmm.HaveGlobalMemoryBarrier()
+}
+
+// GlobalMemoryBarrier implements Platform.GlobalMemoryBarrier.
+func (UseHostGlobalMemoryBarrier) GlobalMemoryBarrier() error {
+ return hostmm.GlobalMemoryBarrier()
+}
+
+// UseHostProcessMemoryBarrier implements Platform.HaveGlobalMemoryBarrier and
+// Platform.GlobalMemoryBarrier by invoking a process-local memory barrier.
+// This is faster than UseHostGlobalMemoryBarrier, but is only appropriate for
+// platforms for which application code executes while using the sentry's
+// mm_struct.
+type UseHostProcessMemoryBarrier struct{}
+
+// HaveGlobalMemoryBarrier implements Platform.HaveGlobalMemoryBarrier.
+func (UseHostProcessMemoryBarrier) HaveGlobalMemoryBarrier() bool {
+ // Fall back to a global memory barrier if a process-local one isn't
+ // available.
+ return hostmm.HaveProcessMemoryBarrier() || hostmm.HaveGlobalMemoryBarrier()
+}
+
+// GlobalMemoryBarrier implements Platform.GlobalMemoryBarrier.
+func (UseHostProcessMemoryBarrier) GlobalMemoryBarrier() error {
+ if hostmm.HaveProcessMemoryBarrier() {
+ return hostmm.ProcessMemoryBarrier()
+ }
+ return hostmm.GlobalMemoryBarrier()
+}
+
+// MemoryManager represents an abstraction above the platform address space
+// which manages memory mappings and their contents.
+type MemoryManager interface {
+ // usermem.IO provides access to the contents of a virtual memory space.
+ usermem.IO
+ // MMap establishes a memory mapping.
+ MMap(ctx context.Context, opts memmap.MMapOpts) (usermem.Addr, error)
+ // AddressSpace returns the AddressSpace bound to mm.
+ AddressSpace() AddressSpace
+}
+
// Context represents the execution context for a single thread.
type Context interface {
// Switch resumes execution of the thread specified by the arch.Context
@@ -143,7 +206,36 @@ type Context interface {
// concurrent call to Switch().
//
// - ErrContextCPUPreempted: See the definition of that error for details.
- Switch(as AddressSpace, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error)
+ Switch(ctx context.Context, mm MemoryManager, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error)
+
+ // PullFullState() pulls the full state of the application thread.
+ //
+ // A platform can support lazy loading/restoring of the thread state,
+ // which includes the registers and the floating point state.
+ //
+ // For example, when the Sentry handles a system call, it may have only
+ // the syscall arguments, without the other registers or the floating
+ // point state. In that case, if the Sentry needs to construct a signal
+ // frame to call a signal handler, it must call PullFullState() to load
+ // all registers and the FPU state.
+ //
+ // Preconditions: The caller must be running on the task goroutine.
+ PullFullState(as AddressSpace, ac arch.Context)
+
+ // FullStateChanged() indicates that the thread state has been changed
+ // by the Sentry. This happens on rt_sigreturn, execve, etc.
+ //
+ // First, it indicates that the Sentry holds the full state of the
+ // thread, so PullFullState() has nothing to do if it is called after
+ // FullStateChanged().
+ //
+ // Second, it forces the full state of the application thread to be
+ // restored. A platform may load/restore the thread state lazily, so
+ // it may skip restoring state that the Sentry has not changed.
+ //
+ // Preconditions: The caller must be running on the task goroutine.
+ FullStateChanged()
// Interrupt interrupts a concurrent call to Switch(), causing it to return
// ErrContextInterrupt.
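
A hypothetical sketch of the bookkeeping behind the PullFullState /
FullStateChanged contract described above; the type and method names are
illustrative, not gvisor's.

package lazystate

// lazyThreadState tracks whether the full register/FPU state is
// resident in the Sentry and whether it must be pushed back before the
// thread resumes.
type lazyThreadState struct {
	fullStateResident bool
	mustRestore       bool
}

// pullFullState loads the complete state only when it is not already
// resident, keeping the common syscall path cheap.
func (s *lazyThreadState) pullFullState(load func()) {
	if s.fullStateResident {
		return
	}
	load()
	s.fullStateResident = true
}

// fullStateChanged records that the Sentry rewrote the thread state
// (rt_sigreturn, execve, ...): there is nothing left to pull, and the
// full state must be restored on the next switch.
func (s *lazyThreadState) fullStateChanged() {
	s.fullStateResident = true
	s.mustRestore = true
}
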
@@ -204,20 +296,32 @@ type AddressSpace interface {
// physical memory) to the mapping. The precommit flag is advisory and
// implementations may choose to ignore it.
//
- // Preconditions: addr and fr must be page-aligned. fr.Length() > 0.
- // at.Any() == true. At least one reference must be held on all pages in
- // fr, and must continue to be held as long as pages are mapped.
- MapFile(addr usermem.Addr, f File, fr FileRange, at usermem.AccessType, precommit bool) error
+ // Preconditions:
+ // * addr and fr must be page-aligned.
+ // * fr.Length() > 0.
+ // * at.Any() == true.
+ // * At least one reference must be held on all pages in fr, and must
+ // continue to be held as long as pages are mapped.
+ MapFile(addr usermem.Addr, f memmap.File, fr memmap.FileRange, at usermem.AccessType, precommit bool) error
// Unmap unmaps the given range.
//
- // Preconditions: addr is page-aligned. length > 0.
+ // Preconditions:
+ // * addr is page-aligned.
+ // * length > 0.
Unmap(addr usermem.Addr, length uint64)
// Release releases this address space. After releasing, a new AddressSpace
// must be acquired via platform.NewAddressSpace().
Release()
+ // PreFork() is called before creating a copy of AddressSpace. This
+ // guarantees that this address space will be in a consistent state.
+ PreFork()
+
+ // PostFork() is called after creating a copy of AddressSpace.
+ PostFork()
+
// AddressSpaceIO methods are supported iff the associated platform's
// Platform.SupportsAddressSpaceIO() == true. AddressSpaces for which this
// does not hold may panic if AddressSpaceIO methods are invoked.
@@ -310,52 +414,6 @@ func (f SegmentationFault) Error() string {
return fmt.Sprintf("segmentation fault at %#x", f.Addr)
}
-// File represents a host file that may be mapped into an AddressSpace.
-type File interface {
- // All pages in a File are reference-counted.
-
- // IncRef increments the reference count on all pages in fr.
- //
- // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() >
- // 0. At least one reference must be held on all pages in fr. (The File
- // interface does not provide a way to acquire an initial reference;
- // implementors may define mechanisms for doing so.)
- IncRef(fr FileRange)
-
- // DecRef decrements the reference count on all pages in fr.
- //
- // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() >
- // 0. At least one reference must be held on all pages in fr.
- DecRef(fr FileRange)
-
- // MapInternal returns a mapping of the given file offsets in the invoking
- // process' address space for reading and writing.
- //
- // Note that fr.Start and fr.End need not be page-aligned.
- //
- // Preconditions: fr.Length() > 0. At least one reference must be held on
- // all pages in fr.
- //
- // Postconditions: The returned mapping is valid as long as at least one
- // reference is held on the mapped pages.
- MapInternal(fr FileRange, at usermem.AccessType) (safemem.BlockSeq, error)
-
- // FD returns the file descriptor represented by the File.
- //
- // The only permitted operation on the returned file descriptor is to map
- // pages from it consistent with the requirements of AddressSpace.MapFile.
- FD() int
-}
-
-// FileRange represents a range of uint64 offsets into a File.
-//
-// type FileRange <generated using go_generics>
-
-// String implements fmt.Stringer.String.
-func (fr FileRange) String() string {
- return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End)
-}
-
// Requirements is used to specify platform specific requirements.
type Requirements struct {
// RequiresCurrentPIDNS indicates that the sandbox has to be started in the
diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD
index 30402c2df..fc43cc3c0 100644
--- a/pkg/sentry/platform/ptrace/BUILD
+++ b/pkg/sentry/platform/ptrace/BUILD
@@ -24,12 +24,13 @@ go_library(
visibility = ["//:sandbox"],
deps = [
"//pkg/abi/linux",
+ "//pkg/context",
"//pkg/log",
"//pkg/procid",
"//pkg/safecopy",
"//pkg/seccomp",
"//pkg/sentry/arch",
- "//pkg/sentry/hostcpu",
+ "//pkg/sentry/memmap",
"//pkg/sentry/platform",
"//pkg/sentry/platform/interrupt",
"//pkg/sync",
diff --git a/pkg/sentry/platform/ptrace/filters.go b/pkg/sentry/platform/ptrace/filters.go
index 1e07cfd0d..b0970e356 100644
--- a/pkg/sentry/platform/ptrace/filters.go
+++ b/pkg/sentry/platform/ptrace/filters.go
@@ -24,10 +24,9 @@ import (
// SyscallFilters returns syscalls made exclusively by the ptrace platform.
func (*PTrace) SyscallFilters() seccomp.SyscallRules {
return seccomp.SyscallRules{
- unix.SYS_GETCPU: {},
- unix.SYS_SCHED_SETAFFINITY: {},
- syscall.SYS_PTRACE: {},
- syscall.SYS_TGKILL: {},
- syscall.SYS_WAIT4: {},
+ unix.SYS_GETCPU: {},
+ syscall.SYS_PTRACE: {},
+ syscall.SYS_TGKILL: {},
+ syscall.SYS_WAIT4: {},
}
}
diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go
index 08d055e05..f56aa3b79 100644
--- a/pkg/sentry/platform/ptrace/ptrace.go
+++ b/pkg/sentry/platform/ptrace/ptrace.go
@@ -48,6 +48,7 @@ import (
"os"
"gvisor.dev/gvisor/pkg/abi/linux"
+ pkgcontext "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/platform/interrupt"
@@ -95,7 +96,8 @@ type context struct {
}
// Switch runs the provided context in the given address space.
-func (c *context) Switch(as platform.AddressSpace, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error) {
+func (c *context) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error) {
+ as := mm.AddressSpace()
s := as.(*subprocess)
isSyscall := s.switchToApp(c, ac)
@@ -180,10 +182,17 @@ func (c *context) Interrupt() {
// Release implements platform.Context.Release().
func (c *context) Release() {}
+// FullStateChanged implements platform.Context.FullStateChanged.
+func (c *context) FullStateChanged() {}
+
+// PullFullState implements platform.Context.PullFullState.
+func (c *context) PullFullState(as platform.AddressSpace, ac arch.Context) {}
+
// PTrace represents a collection of ptrace subprocesses.
type PTrace struct {
platform.MMapMinAddr
platform.NoCPUPreemptionDetection
+ platform.UseHostGlobalMemoryBarrier
}
// New returns a new ptrace-based implementation of the platform interface.
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index 2389423b0..812ab80ef 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -24,6 +24,7 @@ import (
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/procid"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/usermem"
@@ -517,11 +518,6 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
}
defer c.interrupt.Disable()
- // Ensure that the CPU set is bound appropriately; this makes the
- // emulation below several times faster, presumably by avoiding
- // interprocessor wakeups and by simplifying the schedule.
- t.bind()
-
// Set registers.
if err := t.setRegs(regs); err != nil {
panic(fmt.Sprintf("ptrace set regs (%+v) failed: %v", regs, err))
@@ -616,7 +612,7 @@ func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintp
}
// MapFile implements platform.AddressSpace.MapFile.
-func (s *subprocess) MapFile(addr usermem.Addr, f platform.File, fr platform.FileRange, at usermem.AccessType, precommit bool) error {
+func (s *subprocess) MapFile(addr usermem.Addr, f memmap.File, fr memmap.FileRange, at usermem.AccessType, precommit bool) error {
var flags int
if precommit {
flags |= syscall.MAP_POPULATE
@@ -661,3 +657,9 @@ func (s *subprocess) Unmap(addr usermem.Addr, length uint64) {
panic(fmt.Sprintf("munmap(%x, %x)) failed: %v", addr, length, err))
}
}
+
+// PreFork implements platform.AddressSpace.PreFork.
+func (s *subprocess) PreFork() {}
+
+// PostFork implements platform.AddressSpace.PostFork.
+func (s *subprocess) PostFork() {}
diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go
index 84b699f0d..020bbda79 100644
--- a/pkg/sentry/platform/ptrace/subprocess_amd64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go
@@ -201,7 +201,7 @@ func appendArchSeccompRules(rules []seccomp.RuleSet, defaultAction linux.BPFActi
seccomp.RuleSet{
Rules: seccomp.SyscallRules{
syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
- {seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)},
+ {seccomp.EqualTo(linux.ARCH_SET_CPUID), seccomp.EqualTo(0)},
},
},
Action: linux.SECCOMP_RET_ALLOW,
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index 2ce528601..8548853da 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -80,9 +80,9 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
Rules: seccomp.SyscallRules{
syscall.SYS_CLONE: []seccomp.Rule{
// Allow creation of new subprocesses (used by the master).
- {seccomp.AllowValue(syscall.CLONE_FILES | syscall.SIGKILL)},
+ {seccomp.EqualTo(syscall.CLONE_FILES | syscall.SIGKILL)},
// Allow creation of new threads within a single address space (used by address spaces).
- {seccomp.AllowValue(
+ {seccomp.EqualTo(
syscall.CLONE_FILES |
syscall.CLONE_FS |
syscall.CLONE_SIGHAND |
@@ -97,14 +97,14 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
// For the stub prctl dance (all).
syscall.SYS_PRCTL: []seccomp.Rule{
- {seccomp.AllowValue(syscall.PR_SET_PDEATHSIG), seccomp.AllowValue(syscall.SIGKILL)},
+ {seccomp.EqualTo(syscall.PR_SET_PDEATHSIG), seccomp.EqualTo(syscall.SIGKILL)},
},
syscall.SYS_GETPPID: {},
// For the stub to stop itself (all).
syscall.SYS_GETPID: {},
syscall.SYS_KILL: []seccomp.Rule{
- {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SIGSTOP)},
+ {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SIGSTOP)},
},
// Injected to support the address space operations.
@@ -115,7 +115,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
})
}
rules = appendArchSeccompRules(rules, defaultAction)
- instrs, err := seccomp.BuildProgram(rules, defaultAction)
+ instrs, err := seccomp.BuildProgram(rules, defaultAction, defaultAction)
if err != nil {
return nil, err
}
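
This hunk renames the matchers (AllowValue to EqualTo, AllowAny to MatchAny)
and passes an extra default action to seccomp.BuildProgram. A small model of
the matcher semantics, independent of the real BPF compilation; every name
below is illustrative.

package seccompsketch

// argMatcher models one per-argument predicate in a seccomp rule.
type argMatcher interface{ matches(arg uintptr) bool }

// equalTo admits only an argument exactly equal to its value, like
// seccomp.EqualTo above.
type equalTo uintptr

func (e equalTo) matches(arg uintptr) bool { return arg == uintptr(e) }

// matchAny admits any argument, like seccomp.MatchAny above.
type matchAny struct{}

func (matchAny) matches(arg uintptr) bool { return true }

// ruleMatches applies one rule the way the kill rule above pairs
// MatchAny (any pid) with EqualTo(SIGSTOP).
func ruleMatches(rule []argMatcher, args []uintptr) bool {
	for i, m := range rule {
		if i >= len(args) || !m.matches(args[i]) {
			return false
		}
	}
	return true
}
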
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
index 245b20722..533e45497 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
@@ -18,29 +18,12 @@
package ptrace
import (
- "sync/atomic"
"syscall"
"unsafe"
- "golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/hostcpu"
- "gvisor.dev/gvisor/pkg/sync"
)
-// maskPool contains reusable CPU masks for setting affinity. Unfortunately,
-// runtime.NumCPU doesn't actually record the number of CPUs on the system, it
-// just records the number of CPUs available in the scheduler affinity set at
-// startup. This may a) change over time and b) gives a number far lower than
-// the maximum indexable CPU. To prevent lots of allocation in the hot path, we
-// use a pool to store large masks that we can reuse during bind.
-var maskPool = sync.Pool{
- New: func() interface{} {
- const maxCPUs = 1024 // Not a hard limit; see below.
- return make([]uintptr, maxCPUs/64)
- },
-}
-
// unmaskAllSignals unmasks all signals on the current thread.
//
//go:nosplit
@@ -49,47 +32,3 @@ func unmaskAllSignals() syscall.Errno {
_, _, errno := syscall.RawSyscall6(syscall.SYS_RT_SIGPROCMASK, linux.SIG_SETMASK, uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0)
return errno
}
-
-// setCPU sets the CPU affinity.
-func (t *thread) setCPU(cpu uint32) error {
- mask := maskPool.Get().([]uintptr)
- n := int(cpu / 64)
- v := uintptr(1 << uintptr(cpu%64))
- if n >= len(mask) {
- // See maskPool note above. We've actually exceeded the number
- // of available cores. Grow the mask and return it.
- mask = make([]uintptr, n+1)
- }
- mask[n] |= v
- if _, _, errno := syscall.RawSyscall(
- unix.SYS_SCHED_SETAFFINITY,
- uintptr(t.tid),
- uintptr(len(mask)*8),
- uintptr(unsafe.Pointer(&mask[0]))); errno != 0 {
- return errno
- }
- mask[n] &^= v
- maskPool.Put(mask)
- return nil
-}
-
-// bind attempts to ensure that the thread is on the same CPU as the current
-// thread. This provides no guarantees as it is fundamentally a racy operation:
-// CPU sets may change and we may be rescheduled in the middle of this
-// operation. As a result, no failures are reported.
-//
-// Precondition: the current runtime thread should be locked.
-func (t *thread) bind() {
- currentCPU := hostcpu.GetCPU()
-
- if oldCPU := atomic.SwapUint32(&t.cpu, currentCPU); oldCPU != currentCPU {
- // Set the affinity on the thread and save the CPU for next
- // round; we don't expect CPUs to bounce around too frequently.
- //
- // (It's worth noting that we could move CPUs between this point
- // and when the tracee finishes executing. But that would be
- // roughly the status quo anyways -- we're just maximizing our
- // chances of colocation, not guaranteeing it.)
- t.setCPU(currentCPU)
- }
-}
diff --git a/pkg/sentry/platform/ptrace/subprocess_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_unsafe.go
index 0bee995e4..7ee20d89a 100644
--- a/pkg/sentry/platform/ptrace/subprocess_unsafe.go
+++ b/pkg/sentry/platform/ptrace/subprocess_unsafe.go
@@ -13,7 +13,7 @@
// limitations under the License.
// +build go1.12
-// +build !go1.16
+// +build !go1.17
// Check go:linkname function signatures when updating Go version.
diff --git a/pkg/sentry/platform/ring0/aarch64.go b/pkg/sentry/platform/ring0/aarch64.go
index 8122ac6e2..87a573cc4 100644
--- a/pkg/sentry/platform/ring0/aarch64.go
+++ b/pkg/sentry/platform/ring0/aarch64.go
@@ -26,30 +26,31 @@ const (
_PMD_PGT_SIZE = 0x4000
_PTE_PGT_BASE = 0x7000
_PTE_PGT_SIZE = 0x1000
-
- _PSR_D_BIT = 0x00000200
- _PSR_A_BIT = 0x00000100
- _PSR_I_BIT = 0x00000080
- _PSR_F_BIT = 0x00000040
)
const (
- // PSR bits
- PSR_MODE_EL0t = 0x00000000
- PSR_MODE_EL1t = 0x00000004
- PSR_MODE_EL1h = 0x00000005
- PSR_MODE_MASK = 0x0000000f
+ // DAIF bits: Debug, SError, IRQ, FIQ.
+ _PSR_D_BIT = 0x00000200
+ _PSR_A_BIT = 0x00000100
+ _PSR_I_BIT = 0x00000080
+ _PSR_F_BIT = 0x00000040
+ _PSR_DAIF_SHIFT = 6
+ _PSR_DAIF_MASK = 0xf << _PSR_DAIF_SHIFT
- // KernelFlagsSet should always be set in the kernel.
- KernelFlagsSet = PSR_MODE_EL1h
+ // PSR bits.
+ _PSR_MODE_EL0t = 0x00000000
+ _PSR_MODE_EL1t = 0x00000004
+ _PSR_MODE_EL1h = 0x00000005
+ _PSR_MODE_MASK = 0x0000000f
- // UserFlagsSet are always set in userspace.
- UserFlagsSet = PSR_MODE_EL0t
+ PsrFlagsClear = _PSR_MODE_MASK | _PSR_DAIF_MASK
+ PsrModeMask = _PSR_MODE_MASK
- KernelFlagsClear = PSR_MODE_MASK
- UserFlagsClear = PSR_MODE_MASK
+ // KernelFlagsSet should always be set in the kernel.
+ KernelFlagsSet = _PSR_MODE_EL1h | _PSR_D_BIT | _PSR_A_BIT | _PSR_I_BIT | _PSR_F_BIT
- PsrDefaultSet = _PSR_D_BIT | _PSR_A_BIT | _PSR_I_BIT | _PSR_F_BIT
+ // UserFlagsSet are always set in userspace.
+ UserFlagsSet = _PSR_MODE_EL0t
)
// Vector is an exception vector.
diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go
index 9c6c2cf5c..00899273e 100644
--- a/pkg/sentry/platform/ring0/defs_amd64.go
+++ b/pkg/sentry/platform/ring0/defs_amd64.go
@@ -76,15 +76,41 @@ type KernelOpts struct {
type KernelArchState struct {
KernelOpts
+ // cpuEntries is an array of kernelEntry, one per CPU.
+ cpuEntries []kernelEntry
+
// globalIDT is our set of interrupt gates.
- globalIDT idt64
+ globalIDT *idt64
}
-// CPUArchState contains CPU-specific arch state.
-type CPUArchState struct {
+// kernelEntry contains the minimal CPU-specific arch state that can be
+// mapped into the upper half of the address space. A malicious
+// application might steal information from it via CPU bugs, so it is
+// kept minimal.
+type kernelEntry struct {
// stack is the stack used for interrupts on this CPU.
stack [256]byte
+ // scratch0 is scratch space for temporary usage.
+ scratch0 uint64
+
+ // stackTop is the top of the stack.
+ stackTop uint64
+
+ // cpuSelf is back reference to CPU.
+ cpuSelf *CPU
+
+ // kernelCR3 is the cr3 used for sentry kernel.
+ kernelCR3 uintptr
+
+ // gdt is the CPU's descriptor table.
+ gdt descriptorTable
+
+ // tss is the CPU's task state.
+ tss TaskState64
+}
+
+// CPUArchState contains CPU-specific arch state.
+type CPUArchState struct {
// errorCode is the error code from the last exception.
errorCode uintptr
@@ -97,11 +123,7 @@ type CPUArchState struct {
// exception.
errorType uintptr
- // gdt is the CPU's descriptor table.
- gdt descriptorTable
-
- // tss is the CPU's task state.
- tss TaskState64
+ *kernelEntry
}
// ErrorCode returns the last error code.
diff --git a/pkg/sentry/platform/ring0/defs_arm64.go b/pkg/sentry/platform/ring0/defs_arm64.go
index 0e2ab716c..508236e46 100644
--- a/pkg/sentry/platform/ring0/defs_arm64.go
+++ b/pkg/sentry/platform/ring0/defs_arm64.go
@@ -77,6 +77,9 @@ type CPUArchState struct {
// lazyVFP is the value of cpacr_el1.
lazyVFP uintptr
+
+ // appASID is the asid value of guest application.
+ appASID uintptr
}
// ErrorCode returns the last error code.
diff --git a/pkg/sentry/platform/ring0/entry_amd64.go b/pkg/sentry/platform/ring0/entry_amd64.go
index 7fa43c2f5..d87b1fd00 100644
--- a/pkg/sentry/platform/ring0/entry_amd64.go
+++ b/pkg/sentry/platform/ring0/entry_amd64.go
@@ -36,12 +36,15 @@ func sysenter()
// This must be called prior to sysret/iret.
func swapgs()
+// jumpToKernel jumps to the kernel version of the current RIP.
+func jumpToKernel()
+
// sysret returns to userspace from a system call.
//
// The return code is the vector that interrupted execution.
//
// See stubs.go for a note regarding the frame size of this function.
-func sysret(*CPU, *arch.Registers) Vector
+func sysret(cpu *CPU, regs *arch.Registers, userCR3 uintptr) Vector
// "iret is the cadillac of CPL switching."
//
@@ -50,7 +53,7 @@ func sysret(*CPU, *arch.Registers) Vector
// iret is nearly identical to sysret, except an iret is used to fully restore
// all user state. This must be called in cases where all registers need to be
// restored.
-func iret(*CPU, *arch.Registers) Vector
+func iret(cpu *CPU, regs *arch.Registers, userCR3 uintptr) Vector
// exception is the generic exception entry.
//
diff --git a/pkg/sentry/platform/ring0/entry_amd64.s b/pkg/sentry/platform/ring0/entry_amd64.s
index 02df38331..f59747df3 100644
--- a/pkg/sentry/platform/ring0/entry_amd64.s
+++ b/pkg/sentry/platform/ring0/entry_amd64.s
@@ -63,6 +63,15 @@
MOVQ offset+PTRACE_RSI(reg), SI; \
MOVQ offset+PTRACE_RDI(reg), DI;
+// WRITE_CR3() writes the given CR3 value.
+//
+// The code corresponds to:
+//
+// mov %rax, %cr3
+//
+#define WRITE_CR3() \
+ BYTE $0x0f; BYTE $0x22; BYTE $0xd8;
+
// SWAP_GS swaps the kernel GS (CPU).
#define SWAP_GS() \
BYTE $0x0F; BYTE $0x01; BYTE $0xf8;
@@ -75,15 +84,9 @@
#define SYSRET64() \
BYTE $0x48; BYTE $0x0f; BYTE $0x07;
-// LOAD_KERNEL_ADDRESS loads a kernel address.
-#define LOAD_KERNEL_ADDRESS(from, to) \
- MOVQ from, to; \
- ORQ ·KernelStartAddress(SB), to;
-
// LOAD_KERNEL_STACK loads the kernel stack.
-#define LOAD_KERNEL_STACK(from) \
- LOAD_KERNEL_ADDRESS(CPU_SELF(from), SP); \
- LEAQ CPU_STACK_TOP(SP), SP;
+#define LOAD_KERNEL_STACK(entry) \
+ MOVQ ENTRY_STACK_TOP(entry), SP;
// See kernel.go.
TEXT ·Halt(SB),NOSPLIT,$0
@@ -95,58 +98,93 @@ TEXT ·swapgs(SB),NOSPLIT,$0
SWAP_GS()
RET
+// jumpToKernel changes execution to the kernel address space.
+//
+// This works by changing the return address to its kernel (upper-half) version.
+TEXT ·jumpToKernel(SB),NOSPLIT,$0
+ MOVQ 0(SP), AX
+ ORQ ·KernelStartAddress(SB), AX // Future return address.
+ MOVQ AX, 0(SP)
+ RET
+
// See entry_amd64.go.
TEXT ·sysret(SB),NOSPLIT,$0-24
- // Save original state.
- LOAD_KERNEL_ADDRESS(cpu+0(FP), BX)
- LOAD_KERNEL_ADDRESS(regs+8(FP), AX)
+ CALL ·jumpToKernel(SB)
+ // Save original state and stack. sysenter() or exception()
+ // from APP(gr3) will switch to this stack, set the return
+ // value (vector: 32(SP)) and then do RET, which will also
+ // automatically return to the lower half.
+ MOVQ cpu+0(FP), BX
+ MOVQ regs+8(FP), AX
+ MOVQ userCR3+16(FP), CX
MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX)
MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX)
MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX)
+ // Save SP, AX, and userCR3 on the kernel stack.
+ MOVQ CPU_ENTRY(BX), BX
+ LOAD_KERNEL_STACK(BX)
+ PUSHQ PTRACE_RSP(AX)
+ PUSHQ PTRACE_RAX(AX)
+ PUSHQ CX
+
// Restore user register state.
REGISTERS_LOAD(AX, 0)
MOVQ PTRACE_RIP(AX), CX // Needed for SYSRET.
MOVQ PTRACE_FLAGS(AX), R11 // Needed for SYSRET.
- MOVQ PTRACE_RSP(AX), SP // Restore the stack directly.
- MOVQ PTRACE_RAX(AX), AX // Restore AX (scratch).
+
+ // Restore userCR3, AX, and SP.
+ POPQ AX // Get userCR3.
+ WRITE_CR3() // Switch to userCR3.
+ POPQ AX // Restore AX.
+ POPQ SP // Restore SP.
SYSRET64()
// See entry_amd64.go.
TEXT ·iret(SB),NOSPLIT,$0-24
- // Save original state.
- LOAD_KERNEL_ADDRESS(cpu+0(FP), BX)
- LOAD_KERNEL_ADDRESS(regs+8(FP), AX)
+ CALL ·jumpToKernel(SB)
+ // Save original state and stack. sysenter() or exception() from the
+ // application (guest ring 3) will switch to this stack, set the return
+ // value (vector: 32(SP)), and then do RET, which will also
+ // automatically return to the lower half.
+ MOVQ cpu+0(FP), BX
+ MOVQ regs+8(FP), AX
+ MOVQ userCR3+16(FP), CX
MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX)
MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX)
MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX)
// Build an IRET frame & restore state.
+ MOVQ CPU_ENTRY(BX), BX
LOAD_KERNEL_STACK(BX)
- MOVQ PTRACE_SS(AX), BX; PUSHQ BX
- MOVQ PTRACE_RSP(AX), CX; PUSHQ CX
- MOVQ PTRACE_FLAGS(AX), DX; PUSHQ DX
- MOVQ PTRACE_CS(AX), DI; PUSHQ DI
- MOVQ PTRACE_RIP(AX), SI; PUSHQ SI
- REGISTERS_LOAD(AX, 0) // Restore most registers.
- MOVQ PTRACE_RAX(AX), AX // Restore AX (scratch).
+ PUSHQ PTRACE_SS(AX)
+ PUSHQ PTRACE_RSP(AX)
+ PUSHQ PTRACE_FLAGS(AX)
+ PUSHQ PTRACE_CS(AX)
+ PUSHQ PTRACE_RIP(AX)
+ PUSHQ PTRACE_RAX(AX) // Save AX on kernel stack.
+ PUSHQ CX // Save userCR3 on kernel stack.
+ REGISTERS_LOAD(AX, 0) // Restore most registers.
+ POPQ AX // Get userCR3.
+ WRITE_CR3() // Switch to userCR3.
+ POPQ AX // Restore AX.
IRET()
// See entry_amd64.go.
TEXT ·resume(SB),NOSPLIT,$0
// See iret, above.
- MOVQ CPU_REGISTERS+PTRACE_SS(GS), BX; PUSHQ BX
- MOVQ CPU_REGISTERS+PTRACE_RSP(GS), CX; PUSHQ CX
- MOVQ CPU_REGISTERS+PTRACE_FLAGS(GS), DX; PUSHQ DX
- MOVQ CPU_REGISTERS+PTRACE_CS(GS), DI; PUSHQ DI
- MOVQ CPU_REGISTERS+PTRACE_RIP(GS), SI; PUSHQ SI
- REGISTERS_LOAD(GS, CPU_REGISTERS)
- MOVQ CPU_REGISTERS+PTRACE_RAX(GS), AX
+ MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
+ PUSHQ CPU_REGISTERS+PTRACE_SS(AX)
+ PUSHQ CPU_REGISTERS+PTRACE_RSP(AX)
+ PUSHQ CPU_REGISTERS+PTRACE_FLAGS(AX)
+ PUSHQ CPU_REGISTERS+PTRACE_CS(AX)
+ PUSHQ CPU_REGISTERS+PTRACE_RIP(AX)
+ REGISTERS_LOAD(AX, CPU_REGISTERS)
+ MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX
IRET()
// See entry_amd64.go.
TEXT ·Start(SB),NOSPLIT,$0
- LOAD_KERNEL_STACK(AX) // Set the stack.
PUSHQ $0x0 // Previous frame pointer.
MOVQ SP, BP // Set frame pointer.
PUSHQ AX // First argument (CPU).
@@ -155,53 +193,60 @@ TEXT ·Start(SB),NOSPLIT,$0
// See entry_amd64.go.
TEXT ·sysenter(SB),NOSPLIT,$0
- // Interrupts are always disabled while we're executing in kernel mode
- // and always enabled while executing in user mode. Therefore, we can
- // reliably look at the flags in R11 to determine where this syscall
- // was from.
- TESTL $_RFLAGS_IF, R11
+ // _RFLAGS_IOPL0 is always set in user mode and never set in kernel
+ // mode. See the comment on UserFlagsSet for more details.
+ TESTL $_RFLAGS_IOPL0, R11
JZ kernel
-
user:
SWAP_GS()
- XCHGQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Swap stacks.
- XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for AX (regs).
+ MOVQ AX, ENTRY_SCRATCH0(GS) // Save user AX on scratch.
+ MOVQ ENTRY_KERNEL_CR3(GS), AX // Get kernel cr3 on AX.
+ WRITE_CR3() // Switch to kernel cr3.
+
+ MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
+ MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX // Get user regs.
REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX.
- MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX // Load saved AX value.
- MOVQ BX, PTRACE_RAX(AX) // Save everything else.
- MOVQ BX, PTRACE_ORIGRAX(AX)
MOVQ CX, PTRACE_RIP(AX)
MOVQ R11, PTRACE_FLAGS(AX)
- MOVQ CPU_REGISTERS+PTRACE_RSP(GS), BX; MOVQ BX, PTRACE_RSP(AX)
- MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code.
- MOVQ $1, CPU_ERROR_TYPE(GS) // Set error type to user.
+ MOVQ SP, PTRACE_RSP(AX)
+ MOVQ ENTRY_SCRATCH0(GS), CX // Load saved user AX value.
+ MOVQ CX, PTRACE_RAX(AX) // Save everything else.
+ MOVQ CX, PTRACE_ORIGRAX(AX)
+
+ MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
+ MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP // Get stacks.
+ MOVQ $0, CPU_ERROR_CODE(AX) // Clear error code.
+ MOVQ $1, CPU_ERROR_TYPE(AX) // Set error type to user.
// Return to the kernel, where the frame is:
//
- // vector (sp+24)
+ // vector (sp+32)
+ // userCR3 (sp+24)
// regs (sp+16)
// cpu (sp+8)
// vcpu.Switch (sp+0)
//
- MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer.
- MOVQ $Syscall, 24(SP) // Output vector.
+ MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer.
+ MOVQ $Syscall, 32(SP) // Output vector.
RET
kernel:
// We can't restore the original stack, but we can access the registers
// in the CPU state directly. No need for temporary juggling.
- MOVQ AX, CPU_REGISTERS+PTRACE_ORIGRAX(GS)
- MOVQ AX, CPU_REGISTERS+PTRACE_RAX(GS)
- REGISTERS_SAVE(GS, CPU_REGISTERS)
- MOVQ CX, CPU_REGISTERS+PTRACE_RIP(GS)
- MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(GS)
- MOVQ SP, CPU_REGISTERS+PTRACE_RSP(GS)
- MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code.
- MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel.
+ MOVQ AX, ENTRY_SCRATCH0(GS)
+ MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
+ REGISTERS_SAVE(AX, CPU_REGISTERS)
+ MOVQ CX, CPU_REGISTERS+PTRACE_RIP(AX)
+ MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(AX)
+ MOVQ SP, CPU_REGISTERS+PTRACE_RSP(AX)
+ MOVQ ENTRY_SCRATCH0(GS), BX
+ MOVQ BX, CPU_REGISTERS+PTRACE_ORIGRAX(AX)
+ MOVQ BX, CPU_REGISTERS+PTRACE_RAX(AX)
+ MOVQ $0, CPU_ERROR_CODE(AX) // Clear error code.
+ MOVQ $0, CPU_ERROR_TYPE(AX) // Set error type to kernel.
// Call the syscall trampoline.
LOAD_KERNEL_STACK(GS)
- MOVQ CPU_SELF(GS), AX // Load vCPU.
PUSHQ AX // First argument (vCPU).
CALL ·kernelSyscall(SB) // Call the trampoline.
POPQ AX // Pop vCPU.
@@ -230,16 +275,21 @@ TEXT ·exception(SB),NOSPLIT,$0
// ERROR_CODE (sp+8)
// VECTOR (sp+0)
//
- TESTL $_RFLAGS_IF, 32(SP)
+ TESTL $_RFLAGS_IOPL0, 32(SP)
JZ kernel
user:
SWAP_GS()
ADDQ $-8, SP // Adjust for flags.
MOVQ $_KERNEL_FLAGS, 0(SP); BYTE $0x9d; // Reset flags (POPFQ).
- XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for user regs.
+ PUSHQ AX // Save user AX on stack.
+ MOVQ ENTRY_KERNEL_CR3(GS), AX // Get kernel cr3 on AX.
+ WRITE_CR3() // Switch to kernel cr3.
+
+ MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
+ MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX // Get user regs.
REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX.
- MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX // Restore original AX.
+ POPQ BX // Restore original AX.
MOVQ BX, PTRACE_RAX(AX) // Save it.
MOVQ BX, PTRACE_ORIGRAX(AX)
MOVQ 16(SP), BX; MOVQ BX, PTRACE_RIP(AX)
@@ -249,34 +299,36 @@ user:
MOVQ 48(SP), SI; MOVQ SI, PTRACE_SS(AX)
// Copy out and return.
+ MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
MOVQ 0(SP), BX // Load vector.
MOVQ 8(SP), CX // Load error code.
- MOVQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Original stack (kernel version).
- MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer.
- MOVQ CX, CPU_ERROR_CODE(GS) // Set error code.
- MOVQ $1, CPU_ERROR_TYPE(GS) // Set error type to user.
- MOVQ BX, 24(SP) // Output vector.
+ MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP // Original stack (kernel version).
+ MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer.
+ MOVQ CX, CPU_ERROR_CODE(AX) // Set error code.
+ MOVQ $1, CPU_ERROR_TYPE(AX) // Set error type to user.
+ MOVQ BX, 32(SP) // Output vector.
RET
kernel:
// As per above, we can save directly.
- MOVQ AX, CPU_REGISTERS+PTRACE_RAX(GS)
- MOVQ AX, CPU_REGISTERS+PTRACE_ORIGRAX(GS)
- REGISTERS_SAVE(GS, CPU_REGISTERS)
- MOVQ 16(SP), AX; MOVQ AX, CPU_REGISTERS+PTRACE_RIP(GS)
- MOVQ 32(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_FLAGS(GS)
- MOVQ 40(SP), CX; MOVQ CX, CPU_REGISTERS+PTRACE_RSP(GS)
+ PUSHQ AX
+ MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
+ REGISTERS_SAVE(AX, CPU_REGISTERS)
+ POPQ BX
+ MOVQ BX, CPU_REGISTERS+PTRACE_RAX(AX)
+ MOVQ BX, CPU_REGISTERS+PTRACE_ORIGRAX(AX)
+ MOVQ 16(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_RIP(AX)
+ MOVQ 32(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_FLAGS(AX)
+ MOVQ 40(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_RSP(AX)
// Set the error code and adjust the stack.
- MOVQ 8(SP), AX // Load the error code.
- MOVQ AX, CPU_ERROR_CODE(GS) // Copy out to the CPU.
- MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel.
+ MOVQ 8(SP), BX // Load the error code.
+ MOVQ BX, CPU_ERROR_CODE(AX) // Copy out to the CPU.
+ MOVQ $0, CPU_ERROR_TYPE(AX) // Set error type to kernel.
MOVQ 0(SP), BX // BX contains the vector.
- ADDQ $48, SP // Drop the exception frame.
// Call the exception trampoline.
LOAD_KERNEL_STACK(GS)
- MOVQ CPU_SELF(GS), AX // Load vCPU.
PUSHQ BX // Second argument (vector).
PUSHQ AX // First argument (vCPU).
CALL ·kernelException(SB) // Call the trampoline.
diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index db6465663..1079a024b 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -27,7 +27,9 @@
// ERET returns using the ELR and SPSR for the current exception level.
#define ERET() \
- WORD $0xd69f03e0
+ WORD $0xd69f03e0; \
+ DSB $7; \
+ ISB $15;
// RSV_REG is a register that holds el1 information temporarily.
#define RSV_REG R18_PLATFORM
@@ -40,6 +42,22 @@
#define FPEN_ENABLE (FPEN_NOTRAP << FPEN_SHIFT)
+// sctlr_el1: system control register el1.
+#define SCTLR_M 1 << 0
+#define SCTLR_C 1 << 2
+#define SCTLR_I 1 << 12
+#define SCTLR_DZE 1 << 14
+#define SCTLR_UCT 1 << 15
+#define SCTLR_UCI 1 << 26
+
+#define SCTLR_EL1_DEFAULT (SCTLR_M | SCTLR_C | SCTLR_I | SCTLR_UCT | SCTLR_UCI | SCTLR_DZE)
+
+// cntkctl_el1: counter-timer kernel control register el1.
+#define CNTKCTL_EL0PCTEN 1 << 0
+#define CNTKCTL_EL0VCTEN 1 << 1
+
+#define CNTKCTL_EL1_DEFAULT (CNTKCTL_EL0PCTEN | CNTKCTL_EL0VCTEN)
+
// Saves a register set.
//
// This is a macro because it may need to be executed in contexts where a stack is
@@ -280,29 +298,27 @@
LOAD_KERNEL_ADDRESS(CPU_SELF(from), RSV_REG); \
MOVD $CPU_STACK_TOP(RSV_REG), RSV_REG; \
MOVD RSV_REG, RSP; \
- WORD $0xd538d092; \ //MRS TPIDR_EL1, R18
- ISB $15; \
- DSB $15;
+ WORD $0xd538d092; //MRS TPIDR_EL1, R18
// SWITCH_TO_APP_PAGETABLE sets a new pagetable for a container application.
#define SWITCH_TO_APP_PAGETABLE(from) \
- MOVD CPU_TTBR0_APP(from), RSV_REG; \
- WORD $0xd5182012; \ // MSR R18, TTBR0_EL1
+ MRS TTBR1_EL1, R0; \
+ MOVD CPU_APP_ASID(from), R1; \
+ BFI $48, R1, $16, R0; \
+ MSR R0, TTBR1_EL1; \ // set the ASID in TTBR1_EL1 (since TCR.A1 is set)
ISB $15; \
- DSB $15;
+ MOVD CPU_TTBR0_APP(from), RSV_REG; \
+ MSR RSV_REG, TTBR0_EL1;
// SWITCH_TO_KVM_PAGETABLE sets the kvm pagetable.
#define SWITCH_TO_KVM_PAGETABLE(from) \
- MOVD CPU_TTBR0_KVM(from), RSV_REG; \
- WORD $0xd5182012; \ // MSR R18, TTBR0_EL1
+ MRS TTBR1_EL1, R0; \
+ MOVD $1, R1; \
+ BFI $48, R1, $16, R0; \
+ MSR R0, TTBR1_EL1; \
ISB $15; \
- DSB $15;
-
-#define IRQ_ENABLE \
- MSR $2, DAIFSet;
-
-#define IRQ_DISABLE \
- MSR $2, DAIFClr;
+ MOVD CPU_TTBR0_KVM(from), RSV_REG; \
+ MSR RSV_REG, TTBR0_EL1;
#define VFP_ENABLE \
MOVD $FPEN_ENABLE, R0; \
@@ -318,29 +334,30 @@
#define KERNEL_ENTRY_FROM_EL0 \
SUB $16, RSP, RSP; \ // step1, save r18, r9 into kernel temporary stack.
STP (RSV_REG, RSV_REG_APP), 16*0(RSP); \
- WORD $0xd538d092; \ //MRS TPIDR_EL1, R18, step2, switch user pagetable.
- SWITCH_TO_KVM_PAGETABLE(RSV_REG); \
- WORD $0xd538d092; \ //MRS TPIDR_EL1, R18
- MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP; \ // step3, load app context pointer.
- REGISTERS_SAVE(RSV_REG_APP, 0); \ // step4, save app context.
+ WORD $0xd538d092; \ // MRS TPIDR_EL1, R18
+ MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP; \ // step2, load app context pointer.
+ REGISTERS_SAVE(RSV_REG_APP, 0); \ // step3, save app context.
MOVD RSV_REG_APP, R20; \
LDP 16*0(RSP), (RSV_REG, RSV_REG_APP); \
ADD $16, RSP, RSP; \
MOVD RSV_REG, PTRACE_R18(R20); \
MOVD RSV_REG_APP, PTRACE_R9(R20); \
- MOVD R20, RSV_REG_APP; \
+ MRS TPIDR_EL0, R3; \
+ MOVD R3, PTRACE_TLS(R20); \
WORD $0xd5384003; \ // MRS SPSR_EL1, R3
- MOVD R3, PTRACE_PSTATE(RSV_REG_APP); \
+ MOVD R3, PTRACE_PSTATE(R20); \
MRS ELR_EL1, R3; \
- MOVD R3, PTRACE_PC(RSV_REG_APP); \
+ MOVD R3, PTRACE_PC(R20); \
WORD $0xd5384103; \ // MRS SP_EL0, R3
- MOVD R3, PTRACE_SP(RSV_REG_APP);
+ MOVD R3, PTRACE_SP(R20);
// KERNEL_ENTRY_FROM_EL1 is the entry code of the vcpu from el1 to el1.
#define KERNEL_ENTRY_FROM_EL1 \
WORD $0xd538d092; \ //MRS TPIDR_EL1, R18
REGISTERS_SAVE(RSV_REG, CPU_REGISTERS); \ // Save sentry context.
MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R9(RSV_REG); \
+ MRS TPIDR_EL0, R4; \
+ MOVD R4, CPU_REGISTERS+PTRACE_TLS(RSV_REG); \
WORD $0xd5384004; \ // MRS SPSR_EL1, R4
MOVD R4, CPU_REGISTERS+PTRACE_PSTATE(RSV_REG); \
MRS ELR_EL1, R4; \
@@ -349,6 +366,26 @@
MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG); \
LOAD_KERNEL_STACK(RSV_REG); // Load the temporary stack.
+// EXCEPTION_WITH_ERROR is a common exception handler function.
+#define EXCEPTION_WITH_ERROR(user, vector) \
+ WORD $0xd538d092; \ //MRS TPIDR_EL1, R18
+ WORD $0xd538601a; \ //MRS FAR_EL1, R26
+ MOVD R26, CPU_FAULT_ADDR(RSV_REG); \
+ MOVD $user, R3; \
+ MOVD R3, CPU_ERROR_TYPE(RSV_REG); \ // Set error type to user.
+ MOVD $vector, R3; \
+ MOVD R3, CPU_VECTOR_CODE(RSV_REG); \
+ MRS ESR_EL1, R3; \
+ MOVD R3, CPU_ERROR_CODE(RSV_REG); \
+ B ·kernelExitToEl1(SB);
+
+// storeAppASID writes the application's ASID value.
+TEXT ·storeAppASID(SB),NOSPLIT,$0-8
+ MOVD asid+0(FP), R1
+ MRS TPIDR_EL1, RSV_REG
+ MOVD R1, CPU_APP_ASID(RSV_REG)
+ RET
+
// Halt halts execution.
TEXT ·Halt(SB),NOSPLIT,$0
// Clear bluepill.
@@ -356,15 +393,24 @@ TEXT ·Halt(SB),NOSPLIT,$0
CMP RSV_REG, R9
BNE mmio_exit
MOVD $0, CPU_REGISTERS+PTRACE_R9(RSV_REG)
+
mmio_exit:
// Disable fpsimd.
WORD $0xd5381041 // MRS CPACR_EL1, R1
MOVD R1, CPU_LAZY_VFP(RSV_REG)
VFP_DISABLE
- // MMIO_EXIT.
- MOVD $0, R9
- MOVD R0, 0xffff000000001000(R9)
+ // Trigger MMIO_EXIT/_KVM_HYPERCALL_VMEXIT.
+ //
+	// To keep things simple, we use the address of the exception table as
+	// the MMIO base address, so that an MMIO-EXIT can be triggered by
+	// forcibly writing to this read-only space.
+	// The region is also large enough to encode a sufficient number of
+	// hypercall IDs; host user space can then recover which hypercall was
+	// issued from the faulting address.
+ MRS VBAR_EL1, R9
+ MOVD R0, 0x0(R9)
+
RET
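On the host side, the comment above implies that the VMM registers the exception table as a read-only MMIO region and decodes the hypercall from the guest fault address. A hedged sketch (the function and its parameters are hypothetical, not part of this change):

    // hypercallID recovers which hypercall triggered an MMIO exit, given
    // the guest fault address and the VBAR_EL1 base registered as the MMIO
    // region. Assumption: each hypercall writes to a distinct offset.
    func hypercallID(faultAddr, vbarBase uint64) uint64 {
        return faultAddr - vbarBase
    }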
// HaltAndResume halts execution and points the pointer to the resume function.
@@ -392,14 +438,17 @@ TEXT ·Current(SB),NOSPLIT,$0-8
MOVD R8, ret+0(FP)
RET
-#define STACK_FRAME_SIZE 16
+#define STACK_FRAME_SIZE 32
// kernelExitToEl0 is the entrypoint for application in guest_el0.
// Prepare the vcpu environment for container application.
TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
// Step1, save sentry context into memory.
+ MRS TPIDR_EL1, RSV_REG
REGISTERS_SAVE(RSV_REG, CPU_REGISTERS)
MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R9(RSV_REG)
+ MRS TPIDR_EL0, R3
+ MOVD R3, CPU_REGISTERS+PTRACE_TLS(RSV_REG)
WORD $0xd5384003 // MRS SPSR_EL1, R3
MOVD R3, CPU_REGISTERS+PTRACE_PSTATE(RSV_REG)
@@ -409,34 +458,13 @@ TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
MOVD CPU_REGISTERS+PTRACE_R3(RSV_REG), R3
- // Step2, save SP_EL1, PSTATE into kernel temporary stack.
- // switch to temporary stack.
+ // Step2, switch to temporary stack.
LOAD_KERNEL_STACK(RSV_REG)
- WORD $0xd538d092 //MRS TPIDR_EL1, R18
-
- SUB $STACK_FRAME_SIZE, RSP, RSP
- MOVD CPU_REGISTERS+PTRACE_SP(RSV_REG), R11
- MOVD CPU_REGISTERS+PTRACE_PSTATE(RSV_REG), R12
- STP (R11, R12), 16*0(RSP)
-
- MOVD CPU_REGISTERS+PTRACE_R11(RSV_REG), R11
- MOVD CPU_REGISTERS+PTRACE_R12(RSV_REG), R12
-
- // Step3, test user pagetable.
- // If user pagetable is empty, trapped in el1_ia.
- WORD $0xd538d092 //MRS TPIDR_EL1, R18
- SWITCH_TO_APP_PAGETABLE(RSV_REG)
- WORD $0xd538d092 //MRS TPIDR_EL1, R18
- SWITCH_TO_KVM_PAGETABLE(RSV_REG)
- WORD $0xd538d092 //MRS TPIDR_EL1, R18
-
- // If pagetable is not empty, recovery kernel temporary stack.
- ADD $STACK_FRAME_SIZE, RSP, RSP
- // Step4, load app context pointer.
+ // Step3, load app context pointer.
MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP
- // Step5, prepare the environment for container application.
+ // Step4, prepare the environment for container application.
// set sp_el0.
MOVD PTRACE_SP(RSV_REG_APP), R1
WORD $0xd5184101 //MSR R1, SP_EL0
@@ -447,8 +475,18 @@ TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
MOVD PTRACE_PSTATE(RSV_REG_APP), R1
WORD $0xd5184001 //MSR R1, SPSR_EL1
+	// We need to use a kernel-space address to execute the code below,
+	// since after SWITCH_TO_APP_PAGETABLE the ASID has been switched to
+	// the app's ASID.
+ WORD $0x10000061 // ADR R1, do_exit_to_el0
+ ORR $0xffff000000000000, R1, R1
+ JMP (R1)
+
+do_exit_to_el0:
// RSV_REG & RSV_REG_APP will be loaded at the end.
REGISTERS_LOAD(RSV_REG_APP, 0)
+ MOVD PTRACE_TLS(RSV_REG_APP), RSV_REG
+ MSR RSV_REG, TPIDR_EL0
// switch to user pagetable.
MOVD PTRACE_R18(RSV_REG_APP), RSV_REG
@@ -456,11 +494,13 @@ TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
SUB $STACK_FRAME_SIZE, RSP, RSP
STP (RSV_REG, RSV_REG_APP), 16*0(RSP)
+ STP (R0, R1), 16*1(RSP)
WORD $0xd538d092 //MRS TPIDR_EL1, R18
SWITCH_TO_APP_PAGETABLE(RSV_REG)
+ LDP 16*1(RSP), (R0, R1)
LDP 16*0(RSP), (RSV_REG, RSV_REG_APP)
ADD $STACK_FRAME_SIZE, RSP, RSP
@@ -470,7 +510,6 @@ TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
// Prepare the vcpu environment for sentry.
TEXT ·kernelExitToEl1(SB),NOSPLIT,$0
WORD $0xd538d092 //MRS TPIDR_EL1, R18
-
MOVD CPU_REGISTERS+PTRACE_PSTATE(RSV_REG), R1
WORD $0xd5184001 //MSR R1, SPSR_EL1
@@ -480,6 +519,9 @@ TEXT ·kernelExitToEl1(SB),NOSPLIT,$0
MOVD CPU_REGISTERS+PTRACE_SP(RSV_REG), R1
MOVD R1, RSP
+ SWITCH_TO_KVM_PAGETABLE(RSV_REG)
+ MRS TPIDR_EL1, RSV_REG
+
REGISTERS_LOAD(RSV_REG, CPU_REGISTERS)
MOVD CPU_REGISTERS+PTRACE_R9(RSV_REG), RSV_REG_APP
@@ -487,7 +529,13 @@ TEXT ·kernelExitToEl1(SB),NOSPLIT,$0
// Start is the CPU entrypoint.
TEXT ·Start(SB),NOSPLIT,$0
- IRQ_DISABLE
+ // Init.
+ MOVD $SCTLR_EL1_DEFAULT, R1
+ MSR R1, SCTLR_EL1
+
+ MOVD $CNTKCTL_EL1_DEFAULT, R1
+ MSR R1, CNTKCTL_EL1
+
MOVD R8, RSV_REG
ORR $0xffff000000000000, RSV_REG, RSV_REG
WORD $0xd518d092 //MSR R18, TPIDR_EL1
@@ -536,6 +584,7 @@ TEXT ·El1_sync(SB),NOSPLIT,$0
B el1_invalid
el1_da:
+el1_ia:
WORD $0xd538d092 //MRS TPIDR_EL1, R18
WORD $0xd538601a //MRS FAR_EL1, R26
@@ -548,9 +597,6 @@ el1_da:
B ·HaltAndResume(SB)
-el1_ia:
- B ·HaltAndResume(SB)
-
el1_sp_pc:
B ·Shutdown(SB)
@@ -622,24 +668,11 @@ el0_svc:
MOVD $Syscall, R3
MOVD R3, CPU_VECTOR_CODE(RSV_REG)
- B ·HaltAndResume(SB)
+ B ·kernelExitToEl1(SB)
el0_da:
- WORD $0xd538d092 //MRS TPIDR_EL1, R18
- WORD $0xd538601a //MRS FAR_EL1, R26
-
- MOVD R26, CPU_FAULT_ADDR(RSV_REG)
-
- MOVD $1, R3
- MOVD R3, CPU_ERROR_TYPE(RSV_REG) // Set error type to user.
-
- MOVD $PageFault, R3
- MOVD R3, CPU_VECTOR_CODE(RSV_REG)
-
- B ·HaltAndResume(SB)
-
el0_ia:
- B ·Shutdown(SB)
+ EXCEPTION_WITH_ERROR(1, PageFault)
el0_fpsimd_acc:
B ·Shutdown(SB)
@@ -654,7 +687,7 @@ el0_sp_pc:
B ·Shutdown(SB)
el0_undef:
- B ·Shutdown(SB)
+ EXCEPTION_WITH_ERROR(1, El0Sync_undef)
el0_dbg:
B ·Shutdown(SB)
diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD
index 549f3d228..9742308d8 100644
--- a/pkg/sentry/platform/ring0/gen_offsets/BUILD
+++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD
@@ -24,7 +24,10 @@ go_binary(
"defs_impl_arm64.go",
"main.go",
],
- visibility = ["//pkg/sentry/platform/ring0:__pkg__"],
+ visibility = [
+ "//pkg/sentry/platform/kvm:__pkg__",
+ "//pkg/sentry/platform/ring0:__pkg__",
+ ],
deps = [
"//pkg/cpuid",
"//pkg/sentry/arch",
diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go
index 900c0bba7..264be23d3 100644
--- a/pkg/sentry/platform/ring0/kernel.go
+++ b/pkg/sentry/platform/ring0/kernel.go
@@ -19,8 +19,8 @@ package ring0
// N.B. that constraints on KernelOpts must be satisfied.
//
//go:nosplit
-func (k *Kernel) Init(opts KernelOpts) {
- k.init(opts)
+func (k *Kernel) Init(opts KernelOpts, maxCPUs int) {
+ k.init(opts, maxCPUs)
}
// Halt halts execution.
@@ -31,31 +31,57 @@ type defaultHooks struct{}
// KernelSyscall implements Hooks.KernelSyscall.
//
+// +checkescape:all
+//
//go:nosplit
-func (defaultHooks) KernelSyscall() { Halt() }
+func (defaultHooks) KernelSyscall() {
+ Halt()
+}
// KernelException implements Hooks.KernelException.
//
+// +checkescape:all
+//
//go:nosplit
-func (defaultHooks) KernelException(Vector) { Halt() }
+func (defaultHooks) KernelException(Vector) {
+ Halt()
+}
// kernelSyscall is a trampoline.
//
+// On amd64, it is called with %rip in the upper half, so it can NOT
+// access any global data that is not mapped in the upper half, and must
+// call through function pointers or interfaces to switch to the lower
+// half so that the callee can access global data.
+//
+// +checkescape:hard,stack
+//
//go:nosplit
-func kernelSyscall(c *CPU) { c.hooks.KernelSyscall() }
+func kernelSyscall(c *CPU) {
+ c.hooks.KernelSyscall()
+}
// kernelException is a trampoline.
//
+// On amd64, it is called with %rip in the upper half, so it can NOT
+// access any global data that is not mapped in the upper half, and must
+// call through function pointers or interfaces to switch to the lower
+// half so that the callee can access global data.
+//
+// +checkescape:hard,stack
+//
//go:nosplit
-func kernelException(c *CPU, vector Vector) { c.hooks.KernelException(vector) }
+func kernelException(c *CPU, vector Vector) {
+ c.hooks.KernelException(vector)
+}
// Init initializes a new CPU.
//
// Init allows embedding in other objects.
-func (c *CPU) Init(k *Kernel, hooks Hooks) {
- c.self = c // Set self reference.
- c.kernel = k // Set kernel reference.
- c.init() // Perform architectural init.
+func (c *CPU) Init(k *Kernel, cpuID int, hooks Hooks) {
+ c.self = c // Set self reference.
+ c.kernel = k // Set kernel reference.
+ c.init(cpuID) // Perform architectural init.
// Require hooks.
if hooks != nil {
diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go
index 0feff8778..3a9dff4cc 100644
--- a/pkg/sentry/platform/ring0/kernel_amd64.go
+++ b/pkg/sentry/platform/ring0/kernel_amd64.go
@@ -18,13 +18,42 @@ package ring0
import (
"encoding/binary"
+ "reflect"
+
+ "gvisor.dev/gvisor/pkg/usermem"
)
// init initializes architecture-specific state.
-func (k *Kernel) init(opts KernelOpts) {
+func (k *Kernel) init(opts KernelOpts, maxCPUs int) {
// Save the root page tables.
k.PageTables = opts.PageTables
+ entrySize := reflect.TypeOf(kernelEntry{}).Size()
+ var (
+ entries []kernelEntry
+ padding = 1
+ )
+ for {
+ entries = make([]kernelEntry, maxCPUs+padding-1)
+ totalSize := entrySize * uintptr(maxCPUs+padding-1)
+ addr := reflect.ValueOf(&entries[0]).Pointer()
+ if addr&(usermem.PageSize-1) == 0 && totalSize >= usermem.PageSize {
+ // The runtime forces power-of-2 alignment for allocations, and we are therefore
+ // safe once the first address is aligned and the chunk is at least a full page.
+ break
+ }
+ padding = padding << 1
+ }
+ k.cpuEntries = entries
+
+ k.globalIDT = &idt64{}
+ if reflect.TypeOf(idt64{}).Size() != usermem.PageSize {
+ panic("Size of globalIDT should be PageSize")
+ }
+ if reflect.ValueOf(k.globalIDT).Pointer()&(usermem.PageSize-1) != 0 {
+ panic("Allocated globalIDT should be page aligned")
+ }
+
// Setup the IDT, which is uniform.
for v, handler := range handlers {
// Allow Breakpoint and Overflow to be called from all
@@ -39,8 +68,26 @@ func (k *Kernel) init(opts KernelOpts) {
}
}
+func (k *Kernel) EntryRegions() map[uintptr]uintptr {
+ regions := make(map[uintptr]uintptr)
+
+ addr := reflect.ValueOf(&k.cpuEntries[0]).Pointer()
+ size := reflect.TypeOf(kernelEntry{}).Size() * uintptr(len(k.cpuEntries))
+ end, _ := usermem.Addr(addr + size).RoundUp()
+ regions[uintptr(usermem.Addr(addr).RoundDown())] = uintptr(end)
+
+ addr = reflect.ValueOf(k.globalIDT).Pointer()
+ size = reflect.TypeOf(idt64{}).Size()
+ end, _ = usermem.Addr(addr + size).RoundUp()
+ regions[uintptr(usermem.Addr(addr).RoundDown())] = uintptr(end)
+
+ return regions
+}
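EntryRegions exists so that a platform can map the per-CPU entry array and the global IDT into the upper half before any vCPU runs. A sketch of a consumer (the mapping callback is hypothetical; the real mapper lives in the platform package):

    // mapEntryRegions maps every [start, end) region returned by
    // EntryRegions using a platform-supplied mapper.
    func mapEntryRegions(k *Kernel, mapFn func(start, length uintptr)) {
        for start, end := range k.EntryRegions() {
            mapFn(start, end-start)
        }
    }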
+
// init initializes architecture-specific state.
-func (c *CPU) init() {
+func (c *CPU) init(cpuID int) {
+ c.kernelEntry = &c.kernel.cpuEntries[cpuID]
+ c.cpuSelf = c
// Null segment.
c.gdt[0].setNull()
@@ -65,6 +112,7 @@ func (c *CPU) init() {
// Set the kernel stack pointer in the TSS (virtual address).
stackAddr := c.StackTop()
+ c.stackTop = stackAddr
c.tss.rsp0Lo = uint32(stackAddr)
c.tss.rsp0Hi = uint32(stackAddr >> 32)
c.tss.ist1Lo = uint32(stackAddr)
@@ -178,10 +226,12 @@ func IsCanonical(addr uint64) bool {
//
// Precondition: the Rip, Rsp, Fs and Gs registers must be canonical.
//
+// +checkescape:all
+//
//go:nosplit
func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID)
- kernelCR3 := c.kernel.PageTables.CR3(true, switchOpts.KernelPCID)
+ c.kernelCR3 = uintptr(c.kernel.PageTables.CR3(true, switchOpts.KernelPCID))
// Sanitize registers.
regs := switchOpts.Registers
@@ -192,20 +242,16 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
// Perform the switch.
swapgs() // GS will be swapped on return.
- WriteFS(uintptr(regs.Fs_base)) // Set application FS.
- WriteGS(uintptr(regs.Gs_base)) // Set application GS.
- LoadFloatingPoint(switchOpts.FloatingPointState) // Copy in floating point.
- jumpToKernel() // Switch to upper half.
- writeCR3(uintptr(userCR3)) // Change to user address space.
+ WriteFS(uintptr(regs.Fs_base)) // escapes: no. Set application FS.
+ WriteGS(uintptr(regs.Gs_base)) // escapes: no. Set application GS.
+ LoadFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy in floating point.
if switchOpts.FullRestore {
- vector = iret(c, regs)
+ vector = iret(c, regs, uintptr(userCR3))
} else {
- vector = sysret(c, regs)
+ vector = sysret(c, regs, uintptr(userCR3))
}
- writeCR3(uintptr(kernelCR3)) // Return to kernel address space.
- jumpToUser() // Return to lower half.
- SaveFloatingPoint(switchOpts.FloatingPointState) // Copy out floating point.
- WriteFS(uintptr(c.registers.Fs_base)) // Restore kernel FS.
+ SaveFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy out floating point.
+ WriteFS(uintptr(c.registers.Fs_base)) // escapes: no. Restore kernel FS.
return
}
@@ -217,7 +263,7 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
//go:nosplit
func start(c *CPU) {
// Save per-cpu & FS segment.
- WriteGS(kernelAddr(c))
+ WriteGS(kernelAddr(c.kernelEntry))
WriteFS(uintptr(c.registers.Fs_base))
// Initialize floating point.
diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go
index ccacaea6b..b294ccc7c 100644
--- a/pkg/sentry/platform/ring0/kernel_arm64.go
+++ b/pkg/sentry/platform/ring0/kernel_arm64.go
@@ -25,13 +25,13 @@ func HaltAndResume()
func HaltEl1SvcAndResume()
// init initializes architecture-specific state.
-func (k *Kernel) init(opts KernelOpts) {
+func (k *Kernel) init(opts KernelOpts, maxCPUs int) {
// Save the root page tables.
k.PageTables = opts.PageTables
}
// init initializes architecture-specific state.
-func (c *CPU) init() {
+func (c *CPU) init(cpuID int) {
	// Set the kernel stack pointer (virtual address).
c.registers.Sp = uint64(c.StackTop())
@@ -53,14 +53,23 @@ func IsCanonical(addr uint64) bool {
//go:nosplit
func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
- // Sanitize registers.
+ storeAppASID(uintptr(switchOpts.UserASID))
+ if switchOpts.Flush {
+ FlushTlbAll()
+ }
+
regs := switchOpts.Registers
- regs.Pstate &= ^uint64(UserFlagsClear)
+ regs.Pstate &= ^uint64(PsrFlagsClear)
regs.Pstate |= UserFlagsSet
+
+ LoadFloatingPoint(switchOpts.FloatingPointState)
+
kernelExitToEl0()
+
+ SaveFloatingPoint(switchOpts.FloatingPointState)
+
vector = c.vecCode
- // Perform the switch.
return
}
diff --git a/pkg/sentry/platform/ring0/lib_amd64.go b/pkg/sentry/platform/ring0/lib_amd64.go
index ca968a036..0ec5c3bc5 100644
--- a/pkg/sentry/platform/ring0/lib_amd64.go
+++ b/pkg/sentry/platform/ring0/lib_amd64.go
@@ -61,21 +61,9 @@ func wrgsbase(addr uintptr)
// wrgsmsr writes to the GS_BASE MSR.
func wrgsmsr(addr uintptr)
-// writeCR3 writes the CR3 value.
-func writeCR3(phys uintptr)
-
-// readCR3 reads the current CR3 value.
-func readCR3() uintptr
-
// readCR2 reads the current CR2 value.
func readCR2() uintptr
-// jumpToKernel jumps to the kernel version of the current RIP.
-func jumpToKernel()
-
-// jumpToUser jumps to the user version of the current RIP.
-func jumpToUser()
-
// fninit initializes the floating point unit.
func fninit()
diff --git a/pkg/sentry/platform/ring0/lib_amd64.s b/pkg/sentry/platform/ring0/lib_amd64.s
index 75d742750..2fe83568a 100644
--- a/pkg/sentry/platform/ring0/lib_amd64.s
+++ b/pkg/sentry/platform/ring0/lib_amd64.s
@@ -127,53 +127,6 @@ TEXT ·wrgsmsr(SB),NOSPLIT,$0-8
BYTE $0x0f; BYTE $0x30; // WRMSR
RET
-// jumpToUser changes execution to the user address.
-//
-// This works by changing the return value to the user version.
-TEXT ·jumpToUser(SB),NOSPLIT,$0
- MOVQ 0(SP), AX
- MOVQ ·KernelStartAddress(SB), BX
- NOTQ BX
- ANDQ BX, SP // Switch the stack.
- ANDQ BX, BP // Switch the frame pointer.
- ANDQ BX, AX // Future return value.
- MOVQ AX, 0(SP)
- RET
-
-// jumpToKernel changes execution to the kernel address space.
-//
-// This works by changing the return value to the kernel version.
-TEXT ·jumpToKernel(SB),NOSPLIT,$0
- MOVQ 0(SP), AX
- MOVQ ·KernelStartAddress(SB), BX
- ORQ BX, SP // Switch the stack.
- ORQ BX, BP // Switch the frame pointer.
- ORQ BX, AX // Future return value.
- MOVQ AX, 0(SP)
- RET
-
-// writeCR3 writes the given CR3 value.
-//
-// The code corresponds to:
-//
-// mov %rax, %cr3
-//
-TEXT ·writeCR3(SB),NOSPLIT,$0-8
- MOVQ cr3+0(FP), AX
- BYTE $0x0f; BYTE $0x22; BYTE $0xd8;
- RET
-
-// readCR3 reads the current CR3 value.
-//
-// The code corresponds to:
-//
-// mov %cr3, %rax
-//
-TEXT ·readCR3(SB),NOSPLIT,$0-8
- BYTE $0x0f; BYTE $0x20; BYTE $0xd8;
- MOVQ AX, ret+0(FP)
- RET
-
// readCR2 reads the current CR2 value.
//
// The code corresponds to:
diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go
index a6345010d..d91a09de1 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.go
+++ b/pkg/sentry/platform/ring0/lib_arm64.go
@@ -16,6 +16,15 @@
package ring0
+// storeAppASID writes the application's ASID value.
+func storeAppASID(asid uintptr)
+
+// LocalFlushTlbAll is the same as FlushTlbAll, but only applies to the calling CPU.
+func LocalFlushTlbAll()
+
+// FlushTlbAll flushes all TLB entries.
+func FlushTlbAll()
+
// CPACREL1 returns the value of the CPACR_EL1 register.
func CPACREL1() (value uintptr)
@@ -38,11 +47,11 @@ func SaveVRegs(*byte)
// LoadVRegs loads V0-V31 registers.
func LoadVRegs(*byte)
-// GetTLS returns the value of TPIDR_EL0 register.
-func GetTLS() (value uint64)
+// LoadFloatingPoint loads floating point state.
+func LoadFloatingPoint(*byte)
-// SetTLS writes the TPIDR_EL0 value.
-func SetTLS(value uint64)
+// SaveFloatingPoint saves floating point state.
+func SaveFloatingPoint(*byte)
// Init sets function pointers based on architectural features.
//
diff --git a/pkg/sentry/platform/ring0/lib_arm64.s b/pkg/sentry/platform/ring0/lib_arm64.s
index b63e14b41..da9d3cf55 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.s
+++ b/pkg/sentry/platform/ring0/lib_arm64.s
@@ -15,14 +15,18 @@
#include "funcdata.h"
#include "textflag.h"
-TEXT ·GetTLS(SB),NOSPLIT,$0-8
- MRS TPIDR_EL0, R1
- MOVD R1, ret+0(FP)
+TEXT ·LocalFlushTlbAll(SB),NOSPLIT,$0
+ DSB $6 // dsb(nshst)
+ WORD $0xd508871f // __tlbi(vmalle1)
+ DSB $7 // dsb(nsh)
+ ISB $15
RET
-TEXT ·SetTLS(SB),NOSPLIT,$0-8
- MOVD addr+0(FP), R1
- MSR R1, TPIDR_EL0
+TEXT ·FlushTlbAll(SB),NOSPLIT,$0
+ DSB $10 // dsb(ishst)
+ WORD $0xd508831f // __tlbi(vmalle1is)
+ DSB $11 // dsb(ish)
+ ISB $15
RET
TEXT ·CPACREL1(SB),NOSPLIT,$0-8
@@ -129,3 +133,89 @@ TEXT ·LoadVRegs(SB),NOSPLIT,$0-8
ISB $15
RET
+
+TEXT ·LoadFloatingPoint(SB),NOSPLIT,$0-8
+ MOVD addr+0(FP), R0
+
+ MOVD 0(R0), R1
+ MOVD R1, FPSR
+ MOVD 8(R0), R1
+ MOVD R1, NZCV
+
+ FMOVD 16*1(R0), F0
+ FMOVD 16*2(R0), F1
+ FMOVD 16*3(R0), F2
+ FMOVD 16*4(R0), F3
+ FMOVD 16*5(R0), F4
+ FMOVD 16*6(R0), F5
+ FMOVD 16*7(R0), F6
+ FMOVD 16*8(R0), F7
+ FMOVD 16*9(R0), F8
+ FMOVD 16*10(R0), F9
+ FMOVD 16*11(R0), F10
+ FMOVD 16*12(R0), F11
+ FMOVD 16*13(R0), F12
+ FMOVD 16*14(R0), F13
+ FMOVD 16*15(R0), F14
+ FMOVD 16*16(R0), F15
+ FMOVD 16*17(R0), F16
+ FMOVD 16*18(R0), F17
+ FMOVD 16*19(R0), F18
+ FMOVD 16*20(R0), F19
+ FMOVD 16*21(R0), F20
+ FMOVD 16*22(R0), F21
+ FMOVD 16*23(R0), F22
+ FMOVD 16*24(R0), F23
+ FMOVD 16*25(R0), F24
+ FMOVD 16*26(R0), F25
+ FMOVD 16*27(R0), F26
+ FMOVD 16*28(R0), F27
+ FMOVD 16*29(R0), F28
+ FMOVD 16*30(R0), F29
+ FMOVD 16*31(R0), F30
+ FMOVD 16*32(R0), F31
+
+ RET
+
+TEXT ·SaveFloatingPoint(SB),NOSPLIT,$0-8
+ MOVD addr+0(FP), R0
+
+ MOVD FPSR, R1
+ MOVD R1, 0(R0)
+ MOVD NZCV, R1
+ MOVD R1, 8(R0)
+
+ FMOVD F0, 16*1(R0)
+ FMOVD F1, 16*2(R0)
+ FMOVD F2, 16*3(R0)
+ FMOVD F3, 16*4(R0)
+ FMOVD F4, 16*5(R0)
+ FMOVD F5, 16*6(R0)
+ FMOVD F6, 16*7(R0)
+ FMOVD F7, 16*8(R0)
+ FMOVD F8, 16*9(R0)
+ FMOVD F9, 16*10(R0)
+ FMOVD F10, 16*11(R0)
+ FMOVD F11, 16*12(R0)
+ FMOVD F12, 16*13(R0)
+ FMOVD F13, 16*14(R0)
+ FMOVD F14, 16*15(R0)
+ FMOVD F15, 16*16(R0)
+ FMOVD F16, 16*17(R0)
+ FMOVD F17, 16*18(R0)
+ FMOVD F18, 16*19(R0)
+ FMOVD F19, 16*20(R0)
+ FMOVD F20, 16*21(R0)
+ FMOVD F21, 16*22(R0)
+ FMOVD F22, 16*23(R0)
+ FMOVD F23, 16*24(R0)
+ FMOVD F24, 16*25(R0)
+ FMOVD F25, 16*26(R0)
+ FMOVD F26, 16*27(R0)
+ FMOVD F27, 16*28(R0)
+ FMOVD F28, 16*29(R0)
+ FMOVD F29, 16*30(R0)
+ FMOVD F30, 16*31(R0)
+ FMOVD F31, 16*32(R0)
+
+ RET
diff --git a/pkg/sentry/platform/ring0/offsets_amd64.go b/pkg/sentry/platform/ring0/offsets_amd64.go
index b8ab120a0..ca4075b09 100644
--- a/pkg/sentry/platform/ring0/offsets_amd64.go
+++ b/pkg/sentry/platform/ring0/offsets_amd64.go
@@ -30,14 +30,21 @@ func Emit(w io.Writer) {
c := &CPU{}
fmt.Fprintf(w, "\n// CPU offsets.\n")
- fmt.Fprintf(w, "#define CPU_SELF 0x%02x\n", reflect.ValueOf(&c.self).Pointer()-reflect.ValueOf(c).Pointer())
fmt.Fprintf(w, "#define CPU_REGISTERS 0x%02x\n", reflect.ValueOf(&c.registers).Pointer()-reflect.ValueOf(c).Pointer())
- fmt.Fprintf(w, "#define CPU_STACK_TOP 0x%02x\n", reflect.ValueOf(&c.stack[0]).Pointer()-reflect.ValueOf(c).Pointer()+uintptr(len(c.stack)))
fmt.Fprintf(w, "#define CPU_ERROR_CODE 0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer())
fmt.Fprintf(w, "#define CPU_ERROR_TYPE 0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer())
+ fmt.Fprintf(w, "#define CPU_ENTRY 0x%02x\n", reflect.ValueOf(&c.kernelEntry).Pointer()-reflect.ValueOf(c).Pointer())
+
+ e := &kernelEntry{}
+ fmt.Fprintf(w, "\n// CPU entry offsets.\n")
+ fmt.Fprintf(w, "#define ENTRY_SCRATCH0 0x%02x\n", reflect.ValueOf(&e.scratch0).Pointer()-reflect.ValueOf(e).Pointer())
+ fmt.Fprintf(w, "#define ENTRY_STACK_TOP 0x%02x\n", reflect.ValueOf(&e.stackTop).Pointer()-reflect.ValueOf(e).Pointer())
+ fmt.Fprintf(w, "#define ENTRY_CPU_SELF 0x%02x\n", reflect.ValueOf(&e.cpuSelf).Pointer()-reflect.ValueOf(e).Pointer())
+ fmt.Fprintf(w, "#define ENTRY_KERNEL_CR3 0x%02x\n", reflect.ValueOf(&e.kernelCR3).Pointer()-reflect.ValueOf(e).Pointer())
fmt.Fprintf(w, "\n// Bits.\n")
fmt.Fprintf(w, "#define _RFLAGS_IF 0x%02x\n", _RFLAGS_IF)
+ fmt.Fprintf(w, "#define _RFLAGS_IOPL0 0x%02x\n", _RFLAGS_IOPL0)
fmt.Fprintf(w, "#define _KERNEL_FLAGS 0x%02x\n", KernelFlagsSet)
fmt.Fprintf(w, "\n// Vectors.\n")
diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go
index f3de962f0..45eba960d 100644
--- a/pkg/sentry/platform/ring0/offsets_arm64.go
+++ b/pkg/sentry/platform/ring0/offsets_arm64.go
@@ -41,6 +41,7 @@ func Emit(w io.Writer) {
fmt.Fprintf(w, "#define CPU_VECTOR_CODE 0x%02x\n", reflect.ValueOf(&c.vecCode).Pointer()-reflect.ValueOf(c).Pointer())
fmt.Fprintf(w, "#define CPU_APP_ADDR 0x%02x\n", reflect.ValueOf(&c.appAddr).Pointer()-reflect.ValueOf(c).Pointer())
fmt.Fprintf(w, "#define CPU_LAZY_VFP 0x%02x\n", reflect.ValueOf(&c.lazyVFP).Pointer()-reflect.ValueOf(c).Pointer())
+ fmt.Fprintf(w, "#define CPU_APP_ASID 0x%02x\n", reflect.ValueOf(&c.appASID).Pointer()-reflect.ValueOf(c).Pointer())
fmt.Fprintf(w, "\n// Bits.\n")
fmt.Fprintf(w, "#define _KERNEL_FLAGS 0x%02x\n", KernelFlagsSet)
@@ -124,4 +125,5 @@ func Emit(w io.Writer) {
fmt.Fprintf(w, "#define PTRACE_SP 0x%02x\n", reflect.ValueOf(&p.Sp).Pointer()-reflect.ValueOf(p).Pointer())
fmt.Fprintf(w, "#define PTRACE_PC 0x%02x\n", reflect.ValueOf(&p.Pc).Pointer()-reflect.ValueOf(p).Pointer())
fmt.Fprintf(w, "#define PTRACE_PSTATE 0x%02x\n", reflect.ValueOf(&p.Pstate).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_TLS 0x%02x\n", reflect.ValueOf(&p.TPIDR_EL0).Pointer()-reflect.ValueOf(p).Pointer())
}
diff --git a/pkg/sentry/platform/ring0/pagetables/allocator.go b/pkg/sentry/platform/ring0/pagetables/allocator.go
index 23fd5c352..8d75b7599 100644
--- a/pkg/sentry/platform/ring0/pagetables/allocator.go
+++ b/pkg/sentry/platform/ring0/pagetables/allocator.go
@@ -53,9 +53,14 @@ type RuntimeAllocator struct {
// NewRuntimeAllocator returns an allocator that uses runtime allocation.
func NewRuntimeAllocator() *RuntimeAllocator {
- return &RuntimeAllocator{
- used: make(map[*PTEs]struct{}),
- }
+ r := new(RuntimeAllocator)
+ r.Init()
+ return r
+}
+
+// Init initializes a RuntimeAllocator.
+func (r *RuntimeAllocator) Init() {
+ r.used = make(map[*PTEs]struct{})
}
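Splitting Init out of NewRuntimeAllocator lets a caller embed the allocator by value in a larger structure and initialize it in place, avoiding a separate heap allocation. A usage sketch (the embedding struct is hypothetical):

    type perVCPU struct {
        allocator RuntimeAllocator // embedded by value
    }

    func newPerVCPU() *perVCPU {
        p := &perVCPU{}
        p.allocator.Init() // initialize in place instead of NewRuntimeAllocator
        return p
    }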
// Recycle returns freed pages to the pool.
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go
index 87e88e97d..7f18ac296 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go
@@ -86,6 +86,8 @@ func (*mapVisitor) requiresSplit() bool { return true }
//
// Precondition: addr & length must be page-aligned, their sum must not overflow.
//
+// +checkescape:hard,stack
+//
//go:nosplit
func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physical uintptr) bool {
if !opts.AccessType.Any() {
@@ -128,6 +130,8 @@ func (v *unmapVisitor) visit(start uintptr, pte *PTE, align uintptr) {
//
// Precondition: addr & length must be page-aligned.
//
+// +checkescape:hard,stack
+//
//go:nosplit
func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool {
w := unmapWalker{
@@ -162,6 +166,8 @@ func (v *emptyVisitor) visit(start uintptr, pte *PTE, align uintptr) {
//
// Precondition: addr & length must be page-aligned.
//
+// +checkescape:hard,stack
+//
//go:nosplit
func (p *PageTables) IsEmpty(addr usermem.Addr, length uintptr) bool {
w := emptyWalker{
@@ -197,6 +203,8 @@ func (*lookupVisitor) requiresSplit() bool { return false }
// Lookup returns the physical address for the given virtual address.
//
+// +checkescape:hard,stack
+//
//go:nosplit
func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, opts MapOpts) {
mask := uintptr(usermem.PageSize - 1)
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
index 78510ebed..520161755 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
@@ -72,13 +72,14 @@ const (
)
const (
- mtNormal = 0x4 << 2
+ mtDevicenGnRE = 0x1 << 2
+ mtNormal = 0x4 << 2
)
const (
executeDisable = xn
- optionMask = 0xfff | 0xfff<<48
- protDefault = accessed | shared | mtNormal
+ optionMask = 0xfff | 0xffff<<48
+ protDefault = accessed | shared
)
// MapOpts are x86 options.
@@ -184,8 +185,10 @@ func (p *PTE) Set(addr uintptr, opts MapOpts) {
if opts.User {
v |= user
+ v |= mtNormal
} else {
v = v &^ user
+ v |= mtNormal
}
atomic.StoreUintptr((*uintptr)(p), v)
}
@@ -200,7 +203,7 @@ func (p *PTE) setPageTable(pt *PageTables, ptes *PTEs) {
// This should never happen.
panic("unaligned physical address!")
}
- v := addr | typeTable | protDefault
+ v := addr | typeTable | protDefault | mtNormal
atomic.StoreUintptr((*uintptr)(p), v)
}
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go
index 1a49f12a2..5ddd10256 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go
@@ -36,7 +36,7 @@ const (
pudSize = 1 << pudShift
pgdSize = 1 << pgdShift
- ttbrASIDOffset = 55
+ ttbrASIDOffset = 48
ttbrASIDMask = 0xff
entriesPerPage = 512
diff --git a/pkg/sentry/platform/ring0/x86.go b/pkg/sentry/platform/ring0/x86.go
index 9da0ea685..34fbc1c35 100644
--- a/pkg/sentry/platform/ring0/x86.go
+++ b/pkg/sentry/platform/ring0/x86.go
@@ -39,7 +39,9 @@ const (
_RFLAGS_AC = 1 << 18
_RFLAGS_NT = 1 << 14
- _RFLAGS_IOPL = 3 << 12
+ _RFLAGS_IOPL0 = 1 << 12
+ _RFLAGS_IOPL1 = 1 << 13
+ _RFLAGS_IOPL = _RFLAGS_IOPL0 | _RFLAGS_IOPL1
_RFLAGS_DF = 1 << 10
_RFLAGS_IF = 1 << 9
_RFLAGS_STEP = 1 << 8
@@ -67,15 +69,45 @@ const (
KernelFlagsSet = _RFLAGS_RESERVED
// UserFlagsSet are always set in userspace.
- UserFlagsSet = _RFLAGS_RESERVED | _RFLAGS_IF
+ //
+	// _RFLAGS_IOPL is a two-bit field that encodes the I/O privilege
+	// level. The Current Privilege Level (CPL) of the task must be less
+	// than or equal to the IOPL in order for the task or program to
+	// access I/O ports.
+	//
+	// Here, _RFLAGS_IOPL0 is used only to determine whether the task is
+	// running in kernel or userspace mode. In user mode, the CPL is
+	// always 3, and the IOPL value doesn't matter as long as it is below
+	// the CPL.
+	//
+	// We need one bit that is always different in user and kernel modes.
+	// And we have to remember that even though we have KernelFlagsClear,
+	// some of these flags can still appear in kernel mode. This can
+	// happen when the Go runtime switches to a goroutine that was saved
+	// in host mode: on restore, the popf instruction restores the flags,
+	// so all flags the goroutine had in host mode will be restored in
+	// kernel mode.
+	//
+	// _RFLAGS_IOPL0 is never set in host or kernel mode, and is always
+	// set in user mode. So if this flag is set, the task is running in
+	// user mode; otherwise it is running in kernel mode.
+ UserFlagsSet = _RFLAGS_RESERVED | _RFLAGS_IF | _RFLAGS_IOPL0
// KernelFlagsClear should always be clear in the kernel.
KernelFlagsClear = _RFLAGS_STEP | _RFLAGS_IF | _RFLAGS_IOPL | _RFLAGS_AC | _RFLAGS_NT
// UserFlagsClear are always cleared in userspace.
- UserFlagsClear = _RFLAGS_NT | _RFLAGS_IOPL
+ UserFlagsClear = _RFLAGS_NT | _RFLAGS_IOPL1
)
+// IsKernelFlags returns true if rflags corresponds to kernel mode.
+//
+//go:nosplit
+func IsKernelFlags(rflags uint64) bool {
+ return rflags&_RFLAGS_IOPL0 == 0
+}
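The convention documented above reduces mode detection to a single bit test, which is all IsKernelFlags does. A small sketch of how a caller might classify a saved flags value (assuming the definitions in this file):

    // modeOf classifies a saved rflags value using the IOPL0 convention:
    // UserFlagsSet always contains _RFLAGS_IOPL0, KernelFlagsSet never does.
    func modeOf(rflags uint64) string {
        if IsKernelFlags(rflags) {
            return "kernel"
        }
        return "user"
    }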
+
// Vector is an exception vector.
type Vector uintptr
@@ -104,7 +136,7 @@ const (
VirtualizationException
SecurityException = 0x1e
SyscallInt80 = 0x80
- _NR_INTERRUPTS = SyscallInt80 + 1
+ _NR_INTERRUPTS = 0x100
)
// System call vectors.
diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD
index c40c6d673..a3f775d15 100644
--- a/pkg/sentry/socket/BUILD
+++ b/pkg/sentry/socket/BUILD
@@ -10,6 +10,7 @@ go_library(
"//pkg/abi/linux",
"//pkg/binary",
"//pkg/context",
+ "//pkg/marshal",
"//pkg/sentry/device",
"//pkg/sentry/fs",
"//pkg/sentry/fs/fsutil",
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 8b439a078..70ccf77a7 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -68,7 +68,7 @@ func NewSCMRights(t *kernel.Task, fds []int32) (SCMRights, error) {
for _, fd := range fds {
file := t.GetFile(fd)
if file == nil {
- files.Release()
+ files.Release(t)
return nil, syserror.EBADF
}
files = append(files, file)
@@ -100,9 +100,9 @@ func (fs *RightsFiles) Clone() transport.RightsControlMessage {
}
// Release implements transport.RightsControlMessage.Release.
-func (fs *RightsFiles) Release() {
+func (fs *RightsFiles) Release(ctx context.Context) {
for _, f := range *fs {
- f.DecRef()
+ f.DecRef(ctx)
}
*fs = nil
}
@@ -115,7 +115,7 @@ func rightsFDs(t *kernel.Task, rights SCMRights, cloexec bool, max int) ([]int32
fd, err := t.NewFDFrom(0, files[0], kernel.FDFlags{
CloseOnExec: cloexec,
})
- files[0].DecRef()
+ files[0].DecRef(t)
files = files[1:]
if err != nil {
t.Warningf("Error inserting FD: %v", err)
diff --git a/pkg/sentry/socket/control/control_vfs2.go b/pkg/sentry/socket/control/control_vfs2.go
index fd08179be..37d02948f 100644
--- a/pkg/sentry/socket/control/control_vfs2.go
+++ b/pkg/sentry/socket/control/control_vfs2.go
@@ -24,6 +24,8 @@ import (
)
// SCMRightsVFS2 represents a SCM_RIGHTS socket control message.
+//
+// +stateify savable
type SCMRightsVFS2 interface {
transport.RightsControlMessage
@@ -34,9 +36,11 @@ type SCMRightsVFS2 interface {
Files(ctx context.Context, max int) (rf RightsFilesVFS2, truncated bool)
}
-// RightsFiles represents a SCM_RIGHTS socket control message. A reference is
-// maintained for each vfs.FileDescription and is release either when an FD is created or
-// when the Release method is called.
+// RightsFilesVFS2 represents a SCM_RIGHTS socket control message. A reference
+// is maintained for each vfs.FileDescription and is released either when an
+// FD is created or when the Release method is called.
+//
+// +stateify savable
type RightsFilesVFS2 []*vfs.FileDescription
// NewSCMRightsVFS2 creates a new SCM_RIGHTS socket control message
@@ -46,7 +50,7 @@ func NewSCMRightsVFS2(t *kernel.Task, fds []int32) (SCMRightsVFS2, error) {
for _, fd := range fds {
file := t.GetFileVFS2(fd)
if file == nil {
- files.Release()
+ files.Release(t)
return nil, syserror.EBADF
}
files = append(files, file)
@@ -78,9 +82,9 @@ func (fs *RightsFilesVFS2) Clone() transport.RightsControlMessage {
}
// Release implements transport.RightsControlMessage.Release.
-func (fs *RightsFilesVFS2) Release() {
+func (fs *RightsFilesVFS2) Release(ctx context.Context) {
for _, f := range *fs {
- f.DecRef()
+ f.DecRef(ctx)
}
*fs = nil
}
@@ -93,7 +97,7 @@ func rightsFDsVFS2(t *kernel.Task, rights SCMRightsVFS2, cloexec bool, max int)
fd, err := t.NewFDFromVFS2(0, files[0], kernel.FDFlags{
CloseOnExec: cloexec,
})
- files[0].DecRef()
+ files[0].DecRef(t)
files = files[1:]
if err != nil {
t.Warningf("Error inserting FD: %v", err)
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index e82d6cd1e..b6ebe29d6 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -21,11 +21,14 @@ go_library(
"//pkg/context",
"//pkg/fdnotifier",
"//pkg/log",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
"//pkg/safemem",
"//pkg/sentry/arch",
"//pkg/sentry/device",
"//pkg/sentry/fs",
"//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/fsimpl/sockfs",
"//pkg/sentry/hostfd",
"//pkg/sentry/inet",
@@ -36,6 +39,9 @@ go_library(
"//pkg/sentry/vfs",
"//pkg/syserr",
"//pkg/syserror",
+ "//pkg/tcpip",
+ "//pkg/tcpip/network/ipv4",
+ "//pkg/tcpip/network/ipv6",
"//pkg/tcpip/stack",
"//pkg/usermem",
"//pkg/waiter",
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index c11e82c10..7d3c4a01c 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -24,6 +24,8 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fdnotifier"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -98,12 +100,12 @@ func newSocketFile(ctx context.Context, family int, stype linux.SockType, protoc
return nil, syserr.FromError(err)
}
dirent := socket.NewDirent(ctx, socketDevice)
- defer dirent.DecRef()
+ defer dirent.DecRef(ctx)
return fs.NewFile(ctx, dirent, fs.FileFlags{NonBlocking: nonblock, Read: true, Write: true, NonSeekable: true}, s), nil
}
// Release implements fs.FileOperations.Release.
-func (s *socketOpsCommon) Release() {
+func (s *socketOpsCommon) Release(context.Context) {
fdnotifier.RemoveFD(int32(s.fd))
syscall.Close(s.fd)
}
@@ -267,7 +269,7 @@ func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int,
syscall.Close(fd)
return 0, nil, 0, err
}
- defer f.DecRef()
+ defer f.DecRef(t)
kfd, kerr = t.NewFDFromVFS2(0, f, kernel.FDFlags{
CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0,
@@ -279,7 +281,7 @@ func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int,
syscall.Close(fd)
return 0, nil, 0, err
}
- defer f.DecRef()
+ defer f.DecRef(t)
kfd, kerr = t.NewFDFrom(0, f, kernel.FDFlags{
CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0,
@@ -319,12 +321,12 @@ func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error {
}
// GetSockOpt implements socket.Socket.GetSockOpt.
-func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
+func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
if outLen < 0 {
return nil, syserr.ErrInvalidArgument
}
- // Whitelist options and constrain option length.
+ // Only allow known and safe options.
optlen := getSockOptLen(t, level, name)
switch level {
case linux.SOL_IP:
@@ -364,12 +366,13 @@ func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr
if err != nil {
return nil, syserr.FromError(err)
}
- return opt, nil
+ optP := primitive.ByteSlice(opt)
+ return &optP, nil
}
// SetSockOpt implements socket.Socket.SetSockOpt.
func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error {
- // Whitelist options and constrain option length.
+ // Only allow known and safe options.
optlen := setSockOptLen(t, level, name)
switch level {
case linux.SOL_IP:
@@ -415,7 +418,7 @@ func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt []
// RecvMsg implements socket.Socket.RecvMsg.
func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
- // Whitelist flags.
+ // Only allow known and safe flags.
//
// FIXME(jamieliu): We can't support MSG_ERRQUEUE because it uses ancillary
// messages that gvisor/pkg/tcpip/transport/unix doesn't understand. Kill the
@@ -537,7 +540,7 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
// SendMsg implements socket.Socket.SendMsg.
func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
- // Whitelist flags.
+ // Only allow known and safe flags.
if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 {
return 0, syserr.ErrInvalidArgument
}
@@ -708,6 +711,6 @@ func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int
func init() {
for _, family := range []int{syscall.AF_INET, syscall.AF_INET6} {
socket.RegisterProvider(family, &socketProvider{family})
- socket.RegisterProviderVFS2(family, &socketProviderVFS2{})
+ socket.RegisterProviderVFS2(family, &socketProviderVFS2{family})
}
}
diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go
index 677743113..9a2cac40b 100644
--- a/pkg/sentry/socket/hostinet/socket_vfs2.go
+++ b/pkg/sentry/socket/hostinet/socket_vfs2.go
@@ -21,6 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fdnotifier"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
"gvisor.dev/gvisor/pkg/sentry/hostfd"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -32,9 +33,11 @@ import (
"gvisor.dev/gvisor/pkg/waiter"
)
+// +stateify savable
type socketVFS2 struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
+ vfs.LockFD
// We store metadata for hostinet sockets internally. Technically, we should
// access metadata (e.g. through stat, chmod) on the host for correctness,
@@ -49,7 +52,8 @@ var _ = socket.SocketVFS2(&socketVFS2{})
func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol int, fd int, flags uint32) (*vfs.FileDescription, *syserr.Error) {
mnt := t.Kernel().SocketMount()
- d := sockfs.NewDentry(t.Credentials(), mnt)
+ d := sockfs.NewDentry(t, mnt)
+ defer d.DecRef(t)
s := &socketVFS2{
socketOpsCommon: socketOpsCommon{
@@ -59,6 +63,7 @@ func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol in
fd: fd,
},
}
+ s.LockFD.Init(&vfs.FileLocks{})
if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil {
return nil, syserr.FromError(err)
}
@@ -68,11 +73,19 @@ func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol in
DenyPWrite: true,
UseDentryMetadata: true,
}); err != nil {
+ fdnotifier.RemoveFD(int32(s.fd))
return nil, syserr.FromError(err)
}
return vfsfd, nil
}
+// Release implements vfs.FileDescriptionImpl.Release.
+func (s *socketVFS2) Release(ctx context.Context) {
+ t := kernel.TaskFromContext(ctx)
+ t.Kernel().DeleteSocketVFS2(&s.vfsfd)
+ s.socketOpsCommon.Release(ctx)
+}
+
// Readiness implements waiter.Waitable.Readiness.
func (s *socketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask {
return s.socketOpsCommon.Readiness(mask)
@@ -93,7 +106,7 @@ func (s *socketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal
return ioctl(ctx, s.fd, uio, args)
}
-// PRead implements vfs.FileDescriptionImpl.
+// PRead implements vfs.FileDescriptionImpl.PRead.
func (s *socketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
return 0, syserror.ESPIPE
}
@@ -131,6 +144,16 @@ func (s *socketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs
return int64(n), err
}
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (s *socketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (s *socketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence)
+}
+
type socketProviderVFS2 struct {
family int
}
diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go
index a48082631..7e7857ac3 100644
--- a/pkg/sentry/socket/hostinet/stack.go
+++ b/pkg/sentry/socket/hostinet/stack.go
@@ -30,6 +30,9 @@ import (
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+ "gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -53,11 +56,14 @@ type Stack struct {
interfaceAddrs map[int32][]inet.InterfaceAddr
routes []inet.Route
supportsIPv6 bool
+ tcpRecovery inet.TCPLossRecovery
tcpRecvBufSize inet.TCPBufferSize
tcpSendBufSize inet.TCPBufferSize
tcpSACKEnabled bool
netDevFile *os.File
netSNMPFile *os.File
+ ipv4Forwarding bool
+ ipv6Forwarding bool
}
// NewStack returns an empty Stack containing no configuration.
@@ -117,6 +123,13 @@ func (s *Stack) Configure() error {
s.netSNMPFile = f
}
+ s.ipv6Forwarding = false
+ if ipForwarding, err := ioutil.ReadFile("/proc/sys/net/ipv6/conf/all/forwarding"); err == nil {
+ s.ipv6Forwarding = strings.TrimSpace(string(ipForwarding)) != "0"
+ } else {
+		log.Warningf("Failed to read whether ipv6 forwarding is enabled; defaulting to false")
+ }
+
return nil
}
@@ -311,7 +324,12 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr {
}
// AddInterfaceAddr implements inet.Stack.AddInterfaceAddr.
-func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error {
+func (s *Stack) AddInterfaceAddr(int32, inet.InterfaceAddr) error {
+ return syserror.EACCES
+}
+
+// RemoveInterfaceAddr implements inet.Stack.RemoveInterfaceAddr.
+func (s *Stack) RemoveInterfaceAddr(int32, inet.InterfaceAddr) error {
return syserror.EACCES
}
@@ -346,7 +364,17 @@ func (s *Stack) TCPSACKEnabled() (bool, error) {
}
// SetTCPSACKEnabled implements inet.Stack.SetTCPSACKEnabled.
-func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
+func (s *Stack) SetTCPSACKEnabled(bool) error {
+ return syserror.EACCES
+}
+
+// TCPRecovery implements inet.Stack.TCPRecovery.
+func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) {
+ return s.tcpRecovery, nil
+}
+
+// SetTCPRecovery implements inet.Stack.SetTCPRecovery.
+func (s *Stack) SetTCPRecovery(inet.TCPLossRecovery) error {
return syserror.EACCES
}
@@ -407,18 +435,18 @@ func (s *Stack) Statistics(stat interface{}, arg string) error {
}
if rawLine == "" {
- return fmt.Errorf("Failed to get raw line")
+ return fmt.Errorf("failed to get raw line")
}
parts := strings.SplitN(rawLine, ":", 2)
if len(parts) != 2 {
- return fmt.Errorf("Failed to get prefix from: %q", rawLine)
+ return fmt.Errorf("failed to get prefix from: %q", rawLine)
}
sliceStat = toSlice(stat)
fields := strings.Fields(strings.TrimSpace(parts[1]))
if len(fields) != len(sliceStat) {
- return fmt.Errorf("Failed to parse fields: %q", rawLine)
+ return fmt.Errorf("failed to parse fields: %q", rawLine)
}
if _, ok := stat.(*inet.StatSNMPTCP); ok {
snmpTCP = true
@@ -434,7 +462,7 @@ func (s *Stack) Statistics(stat interface{}, arg string) error {
sliceStat[i], err = strconv.ParseUint(fields[i], 10, 64)
}
if err != nil {
- return fmt.Errorf("Failed to parse field %d from: %q, %v", i, rawLine, err)
+ return fmt.Errorf("failed to parse field %d from: %q, %v", i, rawLine, err)
}
}
@@ -457,3 +485,21 @@ func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint { return nil }
// RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints.
func (s *Stack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {}
+
+// Forwarding implements inet.Stack.Forwarding.
+func (s *Stack) Forwarding(protocol tcpip.NetworkProtocolNumber) bool {
+ switch protocol {
+ case ipv4.ProtocolNumber:
+ return s.ipv4Forwarding
+ case ipv6.ProtocolNumber:
+ return s.ipv6Forwarding
+ default:
+ log.Warningf("Forwarding(%v) failed: unsupported protocol", protocol)
+ return false
+ }
+}
+
+// SetForwarding implements inet.Stack.SetForwarding.
+func (s *Stack) SetForwarding(tcpip.NetworkProtocolNumber, bool) error {
+ return syserror.EACCES
+}
diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD
index 721094bbf..8aea0200f 100644
--- a/pkg/sentry/socket/netfilter/BUILD
+++ b/pkg/sentry/socket/netfilter/BUILD
@@ -6,6 +6,8 @@ go_library(
name = "netfilter",
srcs = [
"extensions.go",
+ "ipv4.go",
+ "ipv6.go",
"netfilter.go",
"owner_matcher.go",
"targets.go",
diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go
index 0336a32d8..549787955 100644
--- a/pkg/sentry/socket/netfilter/extensions.go
+++ b/pkg/sentry/socket/netfilter/extensions.go
@@ -19,6 +19,8 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/syserr"
+ "gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -37,7 +39,7 @@ type matchMaker interface {
// name is the matcher name as stored in the xt_entry_match struct.
name() string
- // marshal converts from an stack.Matcher to an ABI struct.
+ // marshal converts from a stack.Matcher to an ABI struct.
marshal(matcher stack.Matcher) []byte
// unmarshal converts from the ABI matcher struct to an
@@ -93,3 +95,71 @@ func unmarshalMatcher(match linux.XTEntryMatch, filter stack.IPHeaderFilter, buf
}
return matchMaker.unmarshal(buf, filter)
}
+
+// targetMaker knows how to (un)marshal a target. Once registered,
+// marshalTarget and unmarshalTarget can be used.
+type targetMaker interface {
+ // id uniquely identifies the target.
+ id() stack.TargetID
+
+ // marshal converts from a stack.Target to an ABI struct.
+ marshal(target stack.Target) []byte
+
+	// unmarshal converts from the ABI target struct to a stack.Target.
+ unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error)
+}
+
+// targetMakers maps the TargetID of supported targets to the targetMaker that
+// marshals and unmarshals it. It is immutable after package initialization.
+var targetMakers = map[stack.TargetID]targetMaker{}
+
+func targetRevision(name string, netProto tcpip.NetworkProtocolNumber, rev uint8) (uint8, bool) {
+ tid := stack.TargetID{
+ Name: name,
+ NetworkProtocol: netProto,
+ Revision: rev,
+ }
+ if _, ok := targetMakers[tid]; !ok {
+ return 0, false
+ }
+
+ // Return the highest supported revision unless rev is higher.
+ for _, other := range targetMakers {
+ otherID := other.id()
+ if name == otherID.Name && netProto == otherID.NetworkProtocol && otherID.Revision > rev {
+ rev = uint8(otherID.Revision)
+ }
+ }
+ return rev, true
+}
+
+// registerTargetMaker should be called by target extensions to register them
+// with the netfilter package.
+func registerTargetMaker(tm targetMaker) {
+ if _, ok := targetMakers[tm.id()]; ok {
+ panic(fmt.Sprintf("multiple targets registered with name %q.", tm.id()))
+ }
+ targetMakers[tm.id()] = tm
+}
+
+func marshalTarget(target stack.Target) []byte {
+ targetMaker, ok := targetMakers[target.ID()]
+ if !ok {
+ panic(fmt.Sprintf("unknown target of type %T with id %+v.", target, target.ID()))
+ }
+ return targetMaker.marshal(target)
+}
+
+func unmarshalTarget(target linux.XTEntryTarget, filter stack.IPHeaderFilter, buf []byte) (stack.Target, *syserr.Error) {
+ tid := stack.TargetID{
+ Name: target.Name.String(),
+ NetworkProtocol: filter.NetworkProtocol(),
+ Revision: target.Revision,
+ }
+ targetMaker, ok := targetMakers[tid]
+ if !ok {
+ nflog("unsupported target with name %q", target.Name.String())
+ return nil, syserr.ErrInvalidArgument
+ }
+ return targetMaker.unmarshal(buf, filter)
+}
diff --git a/pkg/sentry/socket/netfilter/ipv4.go b/pkg/sentry/socket/netfilter/ipv4.go
new file mode 100644
index 000000000..b560fae0d
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/ipv4.go
@@ -0,0 +1,265 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netfilter
+
+import (
+ "bytes"
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/syserr"
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// emptyIPv4Filter is for comparison with a rule's filters to determine whether
+// it is also empty. It is immutable.
+var emptyIPv4Filter = stack.IPHeaderFilter{
+ Dst: "\x00\x00\x00\x00",
+ DstMask: "\x00\x00\x00\x00",
+ Src: "\x00\x00\x00\x00",
+ SrcMask: "\x00\x00\x00\x00",
+}
+
+// convertNetstackToBinary4 converts the iptables as stored in netstack to the
+// format expected by the iptables tool. Linux stores each table as a binary
+// blob that can only be traversed by parsing a little data, reading some
+// offsets, jumping to those offsets, parsing again, etc.
+func convertNetstackToBinary4(stk *stack.Stack, tablename linux.TableName) (linux.KernelIPTGetEntries, linux.IPTGetinfo, error) {
+ // The table name has to fit in the struct.
+ if linux.XT_TABLE_MAXNAMELEN < len(tablename) {
+ return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("table name %q too long", tablename)
+ }
+
+ table, ok := stk.IPTables().GetTable(tablename.String(), false)
+ if !ok {
+ return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("couldn't find table %q", tablename)
+ }
+
+	// Set up the info struct.
+ entries, info := getEntries4(table, tablename)
+ return entries, info, nil
+}
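Reading such a blob back is the offset walk described above: parse a fixed-size entry header, slice out the matchers and target via TargetOffset and NextOffset, then jump ahead by NextOffset. A simplified sketch (bounds validation omitted; the real checks are in modifyEntries4 later in this file):

    // walkEntries traverses a serialized table blob entry by entry.
    func walkEntries(blob []byte, visit func(e linux.IPTEntry, matchers, target []byte)) {
        for off := 0; off+linux.SizeOfIPTEntry <= len(blob); {
            var entry linux.IPTEntry
            binary.Unmarshal(blob[off:off+linux.SizeOfIPTEntry], usermem.ByteOrder, &entry)
            matchers := blob[off+linux.SizeOfIPTEntry : off+int(entry.TargetOffset)]
            target := blob[off+int(entry.TargetOffset) : off+int(entry.NextOffset)]
            visit(entry, matchers, target)
            if entry.NextOffset == 0 {
                return // malformed input; avoid looping forever
            }
            off += int(entry.NextOffset)
        }
    }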
+
+func getEntries4(table stack.Table, tablename linux.TableName) (linux.KernelIPTGetEntries, linux.IPTGetinfo) {
+ var info linux.IPTGetinfo
+ var entries linux.KernelIPTGetEntries
+ copy(info.Name[:], tablename[:])
+ copy(entries.Name[:], info.Name[:])
+ info.ValidHooks = table.ValidHooks()
+
+ for ruleIdx, rule := range table.Rules {
+ nflog("convert to binary: current offset: %d", entries.Size)
+
+ setHooksAndUnderflow(&info, table, entries.Size, ruleIdx)
+ // Each rule corresponds to an entry.
+ entry := linux.KernelIPTEntry{
+ Entry: linux.IPTEntry{
+ IP: linux.IPTIP{
+ Protocol: uint16(rule.Filter.Protocol),
+ },
+ NextOffset: linux.SizeOfIPTEntry,
+ TargetOffset: linux.SizeOfIPTEntry,
+ },
+ }
+ copy(entry.Entry.IP.Dst[:], rule.Filter.Dst)
+ copy(entry.Entry.IP.DstMask[:], rule.Filter.DstMask)
+ copy(entry.Entry.IP.Src[:], rule.Filter.Src)
+ copy(entry.Entry.IP.SrcMask[:], rule.Filter.SrcMask)
+ copy(entry.Entry.IP.OutputInterface[:], rule.Filter.OutputInterface)
+ copy(entry.Entry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask)
+ if rule.Filter.DstInvert {
+ entry.Entry.IP.InverseFlags |= linux.IPT_INV_DSTIP
+ }
+ if rule.Filter.SrcInvert {
+ entry.Entry.IP.InverseFlags |= linux.IPT_INV_SRCIP
+ }
+ if rule.Filter.OutputInterfaceInvert {
+ entry.Entry.IP.InverseFlags |= linux.IPT_INV_VIA_OUT
+ }
+
+ for _, matcher := range rule.Matchers {
+ // Serialize the matcher and add it to the
+ // entry.
+ serialized := marshalMatcher(matcher)
+ nflog("convert to binary: matcher serialized as: %v", serialized)
+ if len(serialized)%8 != 0 {
+ panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher))
+ }
+ entry.Elems = append(entry.Elems, serialized...)
+ entry.Entry.NextOffset += uint16(len(serialized))
+ entry.Entry.TargetOffset += uint16(len(serialized))
+ }
+
+ // Serialize and append the target.
+ serialized := marshalTarget(rule.Target)
+ if len(serialized)%8 != 0 {
+ panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target))
+ }
+ entry.Elems = append(entry.Elems, serialized...)
+ entry.Entry.NextOffset += uint16(len(serialized))
+
+ nflog("convert to binary: adding entry: %+v", entry)
+
+ entries.Size += uint32(entry.Entry.NextOffset)
+ entries.Entrytable = append(entries.Entrytable, entry)
+ info.NumEntries++
+ }
+
+ info.Size = entries.Size
+ nflog("convert to binary: finished with an marshalled size of %d", info.Size)
+ return entries, info
+}
+
+func modifyEntries4(stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, table *stack.Table) (map[uint32]int, *syserr.Error) {
+ nflog("set entries: setting entries in table %q", replace.Name.String())
+
+ // Convert input into a list of rules and their offsets.
+ var offset uint32
+ // offsets maps rule byte offsets to their position in table.Rules.
+ offsets := map[uint32]int{}
+ for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
+ nflog("set entries: processing entry at offset %d", offset)
+
+ // Get the struct ipt_entry.
+ if len(optVal) < linux.SizeOfIPTEntry {
+ nflog("optVal has insufficient size for entry %d", len(optVal))
+ return nil, syserr.ErrInvalidArgument
+ }
+ var entry linux.IPTEntry
+ buf := optVal[:linux.SizeOfIPTEntry]
+ binary.Unmarshal(buf, usermem.ByteOrder, &entry)
+ initialOptValLen := len(optVal)
+ optVal = optVal[linux.SizeOfIPTEntry:]
+
+ if entry.TargetOffset < linux.SizeOfIPTEntry {
+ nflog("entry has too-small target offset %d", entry.TargetOffset)
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ // TODO(gvisor.dev/issue/170): We should support more IPTIP
+ // filtering fields.
+ filter, err := filterFromIPTIP(entry.IP)
+ if err != nil {
+ nflog("bad iptip: %v", err)
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ // TODO(gvisor.dev/issue/170): Matchers and targets can specify
+ // that they only work for certain protocols, hooks, tables.
+ // Get matchers.
+ matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry
+ if len(optVal) < int(matchersSize) {
+ nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal))
+ return nil, syserr.ErrInvalidArgument
+ }
+ matchers, err := parseMatchers(filter, optVal[:matchersSize])
+ if err != nil {
+ nflog("failed to parse matchers: %v", err)
+ return nil, syserr.ErrInvalidArgument
+ }
+ optVal = optVal[matchersSize:]
+
+ // Get the target of the rule.
+ targetSize := entry.NextOffset - entry.TargetOffset
+ if len(optVal) < int(targetSize) {
+ nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ rule := stack.Rule{
+ Filter: filter,
+ Matchers: matchers,
+ }
+
+ {
+ target, err := parseTarget(filter, optVal[:targetSize], false /* ipv6 */)
+ if err != nil {
+ nflog("failed to parse target: %v", err)
+ return nil, err
+ }
+ rule.Target = target
+ }
+ optVal = optVal[targetSize:]
+
+ table.Rules = append(table.Rules, rule)
+ offsets[offset] = int(entryIdx)
+ offset += uint32(entry.NextOffset)
+
+ if initialOptValLen-len(optVal) != int(entry.NextOffset) {
+ nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal))
+ return nil, syserr.ErrInvalidArgument
+ }
+ }
+ return offsets, nil
+}
+
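The offset arithmetic in modifyEntries4 follows the serialized layout of a rule: a struct ipt_entry header, then zero or more matchers, then the target, with TargetOffset and NextOffset measured from the start of the entry. The bounds it computes, isolated as a sketch:

// entryBounds mirrors modifyEntries4: matchers occupy
// [SizeOfIPTEntry, TargetOffset) and the target occupies
// [TargetOffset, NextOffset) within a serialized entry.
func entryBounds(entry linux.IPTEntry) (matchersSize, targetSize uint16) {
	matchersSize = entry.TargetOffset - linux.SizeOfIPTEntry
	targetSize = entry.NextOffset - entry.TargetOffset
	return matchersSize, targetSize
}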
+func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) {
+ if containsUnsupportedFields4(iptip) {
+ return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip)
+ }
+ if len(iptip.Dst) != header.IPv4AddressSize || len(iptip.DstMask) != header.IPv4AddressSize {
+ return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask))
+ }
+ if len(iptip.Src) != header.IPv4AddressSize || len(iptip.SrcMask) != header.IPv4AddressSize {
+ return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask))
+ }
+
+ n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0)
+ if n == -1 {
+ n = len(iptip.OutputInterface)
+ }
+ ifname := string(iptip.OutputInterface[:n])
+
+ n = bytes.IndexByte([]byte(iptip.OutputInterfaceMask[:]), 0)
+ if n == -1 {
+ n = len(iptip.OutputInterfaceMask)
+ }
+ ifnameMask := string(iptip.OutputInterfaceMask[:n])
+
+ return stack.IPHeaderFilter{
+ Protocol: tcpip.TransportProtocolNumber(iptip.Protocol),
+ // A Protocol value of 0 indicates all protocols match.
+ CheckProtocol: iptip.Protocol != 0,
+ Dst: tcpip.Address(iptip.Dst[:]),
+ DstMask: tcpip.Address(iptip.DstMask[:]),
+ DstInvert: iptip.InverseFlags&linux.IPT_INV_DSTIP != 0,
+ Src: tcpip.Address(iptip.Src[:]),
+ SrcMask: tcpip.Address(iptip.SrcMask[:]),
+ SrcInvert: iptip.InverseFlags&linux.IPT_INV_SRCIP != 0,
+ OutputInterface: ifname,
+ OutputInterfaceMask: ifnameMask,
+ OutputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_OUT != 0,
+ }, nil
+}
+
+func containsUnsupportedFields4(iptip linux.IPTIP) bool {
+ // The following features are supported:
+ // - Protocol
+ // - Dst and DstMask
+ // - Src and SrcMask
+ // - The inverse source and destination IP check flags
+ // - OutputInterface, OutputInterfaceMask and their inverse flag.
+ var emptyInterface = [linux.IFNAMSIZ]byte{}
+ // Mask out the supported inverse flags; anything left is unsupported.
+ inverseMask := uint8(linux.IPT_INV_DSTIP) | uint8(linux.IPT_INV_SRCIP) | uint8(linux.IPT_INV_VIA_OUT)
+ return iptip.InputInterface != emptyInterface ||
+ iptip.InputInterfaceMask != emptyInterface ||
+ iptip.Flags != 0 ||
+ iptip.InverseFlags&^inverseMask != 0
+}
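The interface-name fields handled above are fixed-size, NUL-padded byte arrays; the IndexByte pattern trims them into Go strings. The same logic as a reusable sketch (hypothetical helper):

// cString returns b up to the first NUL byte, or all of b if none.
func cString(b []byte) string {
	if n := bytes.IndexByte(b, 0); n != -1 {
		return string(b[:n])
	}
	return string(b)
}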
diff --git a/pkg/sentry/socket/netfilter/ipv6.go b/pkg/sentry/socket/netfilter/ipv6.go
new file mode 100644
index 000000000..4253f7bf4
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/ipv6.go
@@ -0,0 +1,270 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netfilter
+
+import (
+ "bytes"
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/syserr"
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// emptyIPv6Filter is for comparison with a rule's filters to determine whether
+// it is also empty. It is immutable.
+var emptyIPv6Filter = stack.IPHeaderFilter{
+ Dst: "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+ DstMask: "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+ Src: "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+ SrcMask: "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+}
+
+// convertNetstackToBinary6 converts the ip6tables as stored in netstack to the
+// format expected by the iptables tool. Linux stores each table as a binary
+// blob that can only be traversed by parsing a little data, reading some
+// offsets, jumping to those offsets, parsing again, etc.
+func convertNetstackToBinary6(stk *stack.Stack, tablename linux.TableName) (linux.KernelIP6TGetEntries, linux.IPTGetinfo, error) {
+ // The table name has to fit in the struct.
+ if linux.XT_TABLE_MAXNAMELEN < len(tablename) {
+ return linux.KernelIP6TGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("table name %q too long", tablename)
+ }
+
+ table, ok := stk.IPTables().GetTable(tablename.String(), true)
+ if !ok {
+ return linux.KernelIP6TGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("couldn't find table %q", tablename)
+ }
+
+ // Set up the info struct, which is the same in IPv4 and IPv6.
+ entries, info := getEntries6(table, tablename)
+ return entries, info, nil
+}
+
+func getEntries6(table stack.Table, tablename linux.TableName) (linux.KernelIP6TGetEntries, linux.IPTGetinfo) {
+ var info linux.IPTGetinfo
+ var entries linux.KernelIP6TGetEntries
+ copy(info.Name[:], tablename[:])
+ copy(entries.Name[:], info.Name[:])
+ info.ValidHooks = table.ValidHooks()
+
+ for ruleIdx, rule := range table.Rules {
+ nflog("convert to binary: current offset: %d", entries.Size)
+
+ setHooksAndUnderflow(&info, table, entries.Size, ruleIdx)
+ // Each rule corresponds to an entry.
+ entry := linux.KernelIP6TEntry{
+ Entry: linux.IP6TEntry{
+ IPv6: linux.IP6TIP{
+ Protocol: uint16(rule.Filter.Protocol),
+ },
+ NextOffset: linux.SizeOfIP6TEntry,
+ TargetOffset: linux.SizeOfIP6TEntry,
+ },
+ }
+ copy(entry.Entry.IPv6.Dst[:], rule.Filter.Dst)
+ copy(entry.Entry.IPv6.DstMask[:], rule.Filter.DstMask)
+ copy(entry.Entry.IPv6.Src[:], rule.Filter.Src)
+ copy(entry.Entry.IPv6.SrcMask[:], rule.Filter.SrcMask)
+ copy(entry.Entry.IPv6.OutputInterface[:], rule.Filter.OutputInterface)
+ copy(entry.Entry.IPv6.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask)
+ if rule.Filter.DstInvert {
+ entry.Entry.IPv6.InverseFlags |= linux.IP6T_INV_DSTIP
+ }
+ if rule.Filter.SrcInvert {
+ entry.Entry.IPv6.InverseFlags |= linux.IP6T_INV_SRCIP
+ }
+ if rule.Filter.OutputInterfaceInvert {
+ entry.Entry.IPv6.InverseFlags |= linux.IP6T_INV_VIA_OUT
+ }
+ if rule.Filter.CheckProtocol {
+ entry.Entry.IPv6.Flags |= linux.IP6T_F_PROTO
+ }
+
+ for _, matcher := range rule.Matchers {
+ // Serialize the matcher and add it to the
+ // entry.
+ serialized := marshalMatcher(matcher)
+ nflog("convert to binary: matcher serialized as: %v", serialized)
+ if len(serialized)%8 != 0 {
+ panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher))
+ }
+ entry.Elems = append(entry.Elems, serialized...)
+ entry.Entry.NextOffset += uint16(len(serialized))
+ entry.Entry.TargetOffset += uint16(len(serialized))
+ }
+
+ // Serialize and append the target.
+ serialized := marshalTarget(rule.Target)
+ if len(serialized)%8 != 0 {
+ panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target))
+ }
+ entry.Elems = append(entry.Elems, serialized...)
+ entry.Entry.NextOffset += uint16(len(serialized))
+
+ nflog("convert to binary: adding entry: %+v", entry)
+
+ entries.Size += uint32(entry.Entry.NextOffset)
+ entries.Entrytable = append(entries.Entrytable, entry)
+ info.NumEntries++
+ }
+
+ info.Size = entries.Size
+ nflog("convert to binary: finished with an marshalled size of %d", info.Size)
+ return entries, info
+}
+
+func modifyEntries6(stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, table *stack.Table) (map[uint32]int, *syserr.Error) {
+ nflog("set entries: setting entries in table %q", replace.Name.String())
+
+ // Convert input into a list of rules and their offsets.
+ var offset uint32
+ // offsets maps rule byte offsets to their position in table.Rules.
+ offsets := map[uint32]int{}
+ for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
+ nflog("set entries: processing entry at offset %d", offset)
+
+ // Get the struct ipt_entry.
+ if len(optVal) < linux.SizeOfIP6TEntry {
+ nflog("optVal has insufficient size for entry %d", len(optVal))
+ return nil, syserr.ErrInvalidArgument
+ }
+ var entry linux.IP6TEntry
+ buf := optVal[:linux.SizeOfIP6TEntry]
+ binary.Unmarshal(buf, usermem.ByteOrder, &entry)
+ initialOptValLen := len(optVal)
+ optVal = optVal[linux.SizeOfIP6TEntry:]
+
+ if entry.TargetOffset < linux.SizeOfIP6TEntry {
+ nflog("entry has too-small target offset %d", entry.TargetOffset)
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ // TODO(gvisor.dev/issue/170): We should support more IP6TIP
+ // filtering fields.
+ filter, err := filterFromIP6TIP(entry.IPv6)
+ if err != nil {
+ nflog("bad iptip: %v", err)
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ // TODO(gvisor.dev/issue/170): Matchers and targets can specify
+ // that they only work for certain protocols, hooks, tables.
+ // Get matchers.
+ matchersSize := entry.TargetOffset - linux.SizeOfIP6TEntry
+ if len(optVal) < int(matchersSize) {
+ nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal))
+ return nil, syserr.ErrInvalidArgument
+ }
+ matchers, err := parseMatchers(filter, optVal[:matchersSize])
+ if err != nil {
+ nflog("failed to parse matchers: %v", err)
+ return nil, syserr.ErrInvalidArgument
+ }
+ optVal = optVal[matchersSize:]
+
+ // Get the target of the rule.
+ targetSize := entry.NextOffset - entry.TargetOffset
+ if len(optVal) < int(targetSize) {
+ nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ rule := stack.Rule{
+ Filter: filter,
+ Matchers: matchers,
+ }
+
+ {
+ target, err := parseTarget(filter, optVal[:targetSize], true /* ipv6 */)
+ if err != nil {
+ nflog("failed to parse target: %v", err)
+ return nil, err
+ }
+ rule.Target = target
+ }
+ optVal = optVal[targetSize:]
+
+ table.Rules = append(table.Rules, rule)
+ offsets[offset] = int(entryIdx)
+ offset += uint32(entry.NextOffset)
+
+ if initialOptValLen-len(optVal) != int(entry.NextOffset) {
+ nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal))
+ return nil, syserr.ErrInvalidArgument
+ }
+ }
+ return offsets, nil
+}
+
+func filterFromIP6TIP(iptip linux.IP6TIP) (stack.IPHeaderFilter, error) {
+ if containsUnsupportedFields6(iptip) {
+ return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip)
+ }
+ if len(iptip.Dst) != header.IPv6AddressSize || len(iptip.DstMask) != header.IPv6AddressSize {
+ return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask))
+ }
+ if len(iptip.Src) != header.IPv6AddressSize || len(iptip.SrcMask) != header.IPv6AddressSize {
+ return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask))
+ }
+
+ n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0)
+ if n == -1 {
+ n = len(iptip.OutputInterface)
+ }
+ ifname := string(iptip.OutputInterface[:n])
+
+ n = bytes.IndexByte([]byte(iptip.OutputInterfaceMask[:]), 0)
+ if n == -1 {
+ n = len(iptip.OutputInterfaceMask)
+ }
+ ifnameMask := string(iptip.OutputInterfaceMask[:n])
+
+ return stack.IPHeaderFilter{
+ Protocol: tcpip.TransportProtocolNumber(iptip.Protocol),
+ // In ip6tables a flag controls whether to check the protocol.
+ CheckProtocol: iptip.Flags&linux.IP6T_F_PROTO != 0,
+ Dst: tcpip.Address(iptip.Dst[:]),
+ DstMask: tcpip.Address(iptip.DstMask[:]),
+ DstInvert: iptip.InverseFlags&linux.IP6T_INV_DSTIP != 0,
+ Src: tcpip.Address(iptip.Src[:]),
+ SrcMask: tcpip.Address(iptip.SrcMask[:]),
+ SrcInvert: iptip.InverseFlags&linux.IP6T_INV_SRCIP != 0,
+ OutputInterface: ifname,
+ OutputInterfaceMask: ifnameMask,
+ OutputInterfaceInvert: iptip.InverseFlags&linux.IP6T_INV_VIA_OUT != 0,
+ }, nil
+}
+
+func containsUnsupportedFields6(iptip linux.IP6TIP) bool {
+ // The following features are supported:
+ // - Protocol
+ // - Dst and DstMask
+ // - Src and SrcMask
+ // - The inverse source and destination IP check flags
+ // - OutputInterface, OutputInterfaceMask and their inverse flag.
+ var emptyInterface = [linux.IFNAMSIZ]byte{}
+ flagMask := uint8(linux.IP6T_F_PROTO)
+ // Mask out the supported inverse flags; anything left is unsupported.
+ inverseMask := uint8(linux.IP6T_INV_DSTIP) | uint8(linux.IP6T_INV_SRCIP) | uint8(linux.IP6T_INV_VIA_OUT)
+ return iptip.InputInterface != emptyInterface ||
+ iptip.InputInterfaceMask != emptyInterface ||
+ iptip.Flags&^flagMask != 0 ||
+ iptip.InverseFlags&^inverseMask != 0 ||
+ iptip.TOS != 0
+}
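containsUnsupportedFields6 leans on Go's AND NOT operator (&^) to clear the supported bits and reject whatever remains. The pattern in isolation, as a sketch:

// hasUnsupportedBits reports whether flags has any bit set outside the
// supported mask.
func hasUnsupportedBits(flags, supported uint8) bool {
	return flags&^supported != 0
}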
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 47ff48c00..904a12e38 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -17,7 +17,6 @@
package netfilter
import (
- "bytes"
"errors"
"fmt"
@@ -27,47 +26,15 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/tcpip"
- "gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/usermem"
)
-// errorTargetName is used to mark targets as error targets. Error targets
-// shouldn't be reached - an error has occurred if we fall through to one.
-const errorTargetName = "ERROR"
-
-// redirectTargetName is used to mark targets as redirect targets. Redirect
-// targets should be reached for only NAT and Mangle tables. These targets will
-// change the destination port/destination IP for packets.
-const redirectTargetName = "REDIRECT"
-
-// Metadata is used to verify that we are correctly serializing and
-// deserializing iptables into structs consumable by the iptables tool. We save
-// a metadata struct when the tables are written, and when they are read out we
-// verify that certain fields are the same.
-//
-// metadata is used by this serialization/deserializing code, not netstack.
-type metadata struct {
- HookEntry [linux.NF_INET_NUMHOOKS]uint32
- Underflow [linux.NF_INET_NUMHOOKS]uint32
- NumEntries uint32
- Size uint32
-}
-
// enableLogging controls whether to log the (de)serialization of netfilter
// structs between userspace and netstack. These logs are useful when
// developing iptables, but can pollute sentry logs otherwise.
const enableLogging = false
-// emptyFilter is for comparison with a rule's filters to determine whether it
-// is also empty. It is immutable.
-var emptyFilter = stack.IPHeaderFilter{
- Dst: "\x00\x00\x00\x00",
- DstMask: "\x00\x00\x00\x00",
- Src: "\x00\x00\x00\x00",
- SrcMask: "\x00\x00\x00\x00",
-}
-
// nflog logs messages related to the writing and reading of iptables.
func nflog(format string, args ...interface{}) {
if enableLogging && log.IsLogging(log.Debug) {
@@ -76,65 +43,44 @@ func nflog(format string, args ...interface{}) {
}
// GetInfo returns information about iptables.
-func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPTGetinfo, *syserr.Error) {
+func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, ipv6 bool) (linux.IPTGetinfo, *syserr.Error) {
// Read in the struct and table name.
var info linux.IPTGetinfo
- if _, err := t.CopyIn(outPtr, &info); err != nil {
+ if _, err := info.CopyIn(t, outPtr); err != nil {
return linux.IPTGetinfo{}, syserr.FromError(err)
}
- // Find the appropriate table.
- table, err := findTable(stack, info.Name)
+ var err error
+ if ipv6 {
+ _, info, err = convertNetstackToBinary6(stack, info.Name)
+ } else {
+ _, info, err = convertNetstackToBinary4(stack, info.Name)
+ }
if err != nil {
- nflog("%v", err)
+ nflog("couldn't convert iptables: %v", err)
return linux.IPTGetinfo{}, syserr.ErrInvalidArgument
}
- // Get the hooks that apply to this table.
- info.ValidHooks = table.ValidHooks()
-
- // Grab the metadata struct, which is used to store information (e.g.
- // the number of entries) that applies to the user's encoding of
- // iptables, but not netstack's.
- metadata := table.Metadata().(metadata)
-
- // Set values from metadata.
- info.HookEntry = metadata.HookEntry
- info.Underflow = metadata.Underflow
- info.NumEntries = metadata.NumEntries
- info.Size = metadata.Size
-
nflog("returning info: %+v", info)
-
return info, nil
}
-// GetEntries returns netstack's iptables rules encoded for the iptables tool.
-func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen int) (linux.KernelIPTGetEntries, *syserr.Error) {
+// GetEntries4 returns netstack's iptables rules.
+func GetEntries4(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen int) (linux.KernelIPTGetEntries, *syserr.Error) {
// Read in the struct and table name.
var userEntries linux.IPTGetEntries
- if _, err := t.CopyIn(outPtr, &userEntries); err != nil {
+ if _, err := userEntries.CopyIn(t, outPtr); err != nil {
nflog("couldn't copy in entries %q", userEntries.Name)
return linux.KernelIPTGetEntries{}, syserr.FromError(err)
}
- // Find the appropriate table.
- table, err := findTable(stack, userEntries.Name)
- if err != nil {
- nflog("%v", err)
- return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
- }
-
// Convert netstack's iptables rules to something that the iptables
// tool can understand.
- entries, meta, err := convertNetstackToBinary(userEntries.Name.String(), table)
+ entries, _, err := convertNetstackToBinary4(stack, userEntries.Name)
if err != nil {
nflog("couldn't read entries: %v", err)
return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
}
- if meta != table.Metadata().(metadata) {
- panic(fmt.Sprintf("Table %q metadata changed between writing and reading. Was saved as %+v, but is now %+v", userEntries.Name.String(), table.Metadata().(metadata), meta))
- }
if binary.Size(entries) > uintptr(outLen) {
nflog("insufficient GetEntries output size: %d", uintptr(outLen))
return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
@@ -143,258 +89,53 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen
return entries, nil
}
-func findTable(stk *stack.Stack, tablename linux.TableName) (stack.Table, error) {
- ipt := stk.IPTables()
- table, ok := ipt.Tables[tablename.String()]
- if !ok {
- return stack.Table{}, fmt.Errorf("couldn't find table %q", tablename)
- }
- return table, nil
-}
-
-// FillDefaultIPTables sets stack's IPTables to the default tables and
-// populates them with metadata.
-func FillDefaultIPTables(stk *stack.Stack) {
- ipt := stack.DefaultTables()
-
- // In order to fill in the metadata, we have to translate ipt from its
- // netstack format to Linux's giant-binary-blob format.
- for name, table := range ipt.Tables {
- _, metadata, err := convertNetstackToBinary(name, table)
- if err != nil {
- panic(fmt.Errorf("Unable to set default IP tables: %v", err))
- }
- table.SetMetadata(metadata)
- ipt.Tables[name] = table
- }
-
- stk.SetIPTables(ipt)
-}
-
-// convertNetstackToBinary converts the iptables as stored in netstack to the
-// format expected by the iptables tool. Linux stores each table as a binary
-// blob that can only be traversed by parsing a bit, reading some offsets,
-// jumping to those offsets, parsing again, etc.
-func convertNetstackToBinary(tablename string, table stack.Table) (linux.KernelIPTGetEntries, metadata, error) {
- // Return values.
- var entries linux.KernelIPTGetEntries
- var meta metadata
-
- // The table name has to fit in the struct.
- if linux.XT_TABLE_MAXNAMELEN < len(tablename) {
- return linux.KernelIPTGetEntries{}, metadata{}, fmt.Errorf("table name %q too long.", tablename)
- }
- copy(entries.Name[:], tablename)
-
- for ruleIdx, rule := range table.Rules {
- nflog("convert to binary: current offset: %d", entries.Size)
-
- // Is this a chain entry point?
- for hook, hookRuleIdx := range table.BuiltinChains {
- if hookRuleIdx == ruleIdx {
- nflog("convert to binary: found hook %d at offset %d", hook, entries.Size)
- meta.HookEntry[hook] = entries.Size
- }
- }
- // Is this a chain underflow point?
- for underflow, underflowRuleIdx := range table.Underflows {
- if underflowRuleIdx == ruleIdx {
- nflog("convert to binary: found underflow %d at offset %d", underflow, entries.Size)
- meta.Underflow[underflow] = entries.Size
- }
- }
-
- // Each rule corresponds to an entry.
- entry := linux.KernelIPTEntry{
- IPTEntry: linux.IPTEntry{
- IP: linux.IPTIP{
- Protocol: uint16(rule.Filter.Protocol),
- },
- NextOffset: linux.SizeOfIPTEntry,
- TargetOffset: linux.SizeOfIPTEntry,
- },
- }
- copy(entry.IPTEntry.IP.Dst[:], rule.Filter.Dst)
- copy(entry.IPTEntry.IP.DstMask[:], rule.Filter.DstMask)
- copy(entry.IPTEntry.IP.Src[:], rule.Filter.Src)
- copy(entry.IPTEntry.IP.SrcMask[:], rule.Filter.SrcMask)
- copy(entry.IPTEntry.IP.OutputInterface[:], rule.Filter.OutputInterface)
- copy(entry.IPTEntry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask)
- if rule.Filter.DstInvert {
- entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_DSTIP
- }
- if rule.Filter.SrcInvert {
- entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_SRCIP
- }
- if rule.Filter.OutputInterfaceInvert {
- entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_VIA_OUT
- }
-
- for _, matcher := range rule.Matchers {
- // Serialize the matcher and add it to the
- // entry.
- serialized := marshalMatcher(matcher)
- nflog("convert to binary: matcher serialized as: %v", serialized)
- if len(serialized)%8 != 0 {
- panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher))
- }
- entry.Elems = append(entry.Elems, serialized...)
- entry.NextOffset += uint16(len(serialized))
- entry.TargetOffset += uint16(len(serialized))
- }
-
- // Serialize and append the target.
- serialized := marshalTarget(rule.Target)
- if len(serialized)%8 != 0 {
- panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target))
- }
- entry.Elems = append(entry.Elems, serialized...)
- entry.NextOffset += uint16(len(serialized))
-
- nflog("convert to binary: adding entry: %+v", entry)
-
- entries.Size += uint32(entry.NextOffset)
- entries.Entrytable = append(entries.Entrytable, entry)
- meta.NumEntries++
- }
-
- nflog("convert to binary: finished with an marshalled size of %d", meta.Size)
- meta.Size = entries.Size
- return entries, meta, nil
-}
-
-func marshalTarget(target stack.Target) []byte {
- switch tg := target.(type) {
- case stack.AcceptTarget:
- return marshalStandardTarget(stack.RuleAccept)
- case stack.DropTarget:
- return marshalStandardTarget(stack.RuleDrop)
- case stack.ErrorTarget:
- return marshalErrorTarget(errorTargetName)
- case stack.UserChainTarget:
- return marshalErrorTarget(tg.Name)
- case stack.ReturnTarget:
- return marshalStandardTarget(stack.RuleReturn)
- case stack.RedirectTarget:
- return marshalRedirectTarget(tg)
- case JumpTarget:
- return marshalJumpTarget(tg)
- default:
- panic(fmt.Errorf("unknown target of type %T", target))
- }
-}
-
-func marshalStandardTarget(verdict stack.RuleVerdict) []byte {
- nflog("convert to binary: marshalling standard target")
-
- // The target's name will be the empty string.
- target := linux.XTStandardTarget{
- Target: linux.XTEntryTarget{
- TargetSize: linux.SizeOfXTStandardTarget,
- },
- Verdict: translateFromStandardVerdict(verdict),
- }
-
- ret := make([]byte, 0, linux.SizeOfXTStandardTarget)
- return binary.Marshal(ret, usermem.ByteOrder, target)
-}
-
-func marshalErrorTarget(errorName string) []byte {
- // This is an error target named error
- target := linux.XTErrorTarget{
- Target: linux.XTEntryTarget{
- TargetSize: linux.SizeOfXTErrorTarget,
- },
- }
- copy(target.Name[:], errorName)
- copy(target.Target.Name[:], errorTargetName)
-
- ret := make([]byte, 0, linux.SizeOfXTErrorTarget)
- return binary.Marshal(ret, usermem.ByteOrder, target)
-}
-
-func marshalRedirectTarget(rt stack.RedirectTarget) []byte {
- // This is a redirect target named redirect
- target := linux.XTRedirectTarget{
- Target: linux.XTEntryTarget{
- TargetSize: linux.SizeOfXTRedirectTarget,
- },
+// GetEntries6 returns netstack's ip6tables rules.
+func GetEntries6(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen int) (linux.KernelIP6TGetEntries, *syserr.Error) {
+ // Read in the struct and table name. IPv4 and IPv6 use structs
+ // with the same layout.
+ var userEntries linux.IPTGetEntries
+ if _, err := userEntries.CopyIn(t, outPtr); err != nil {
+ nflog("couldn't copy in entries %q", userEntries.Name)
+ return linux.KernelIP6TGetEntries{}, syserr.FromError(err)
}
- copy(target.Target.Name[:], redirectTargetName)
- ret := make([]byte, 0, linux.SizeOfXTRedirectTarget)
- target.NfRange.RangeSize = 1
- if rt.RangeProtoSpecified {
- target.NfRange.RangeIPV4.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED
+ // Convert netstack's iptables rules to something that the iptables
+ // tool can understand.
+ entries, _, err := convertNetstackToBinary6(stack, userEntries.Name)
+ if err != nil {
+ nflog("couldn't read entries: %v", err)
+ return linux.KernelIP6TGetEntries{}, syserr.ErrInvalidArgument
}
- // Convert port from little endian to big endian.
- port := make([]byte, 2)
- binary.LittleEndian.PutUint16(port, rt.MinPort)
- target.NfRange.RangeIPV4.MinPort = binary.BigEndian.Uint16(port)
- binary.LittleEndian.PutUint16(port, rt.MaxPort)
- target.NfRange.RangeIPV4.MaxPort = binary.BigEndian.Uint16(port)
- return binary.Marshal(ret, usermem.ByteOrder, target)
-}
-
-func marshalJumpTarget(jt JumpTarget) []byte {
- nflog("convert to binary: marshalling jump target")
-
- // The target's name will be the empty string.
- target := linux.XTStandardTarget{
- Target: linux.XTEntryTarget{
- TargetSize: linux.SizeOfXTStandardTarget,
- },
- // Verdict is overloaded by the ABI. When positive, it holds
- // the jump offset from the start of the table.
- Verdict: int32(jt.Offset),
+ if binary.Size(entries) > uintptr(outLen) {
+ nflog("insufficient GetEntries output size: %d", uintptr(outLen))
+ return linux.KernelIP6TGetEntries{}, syserr.ErrInvalidArgument
}
- ret := make([]byte, 0, linux.SizeOfXTStandardTarget)
- return binary.Marshal(ret, usermem.ByteOrder, target)
+ return entries, nil
}
-// translateFromStandardVerdict translates verdicts the same way as the iptables
-// tool.
-func translateFromStandardVerdict(verdict stack.RuleVerdict) int32 {
- switch verdict {
- case stack.RuleAccept:
- return -linux.NF_ACCEPT - 1
- case stack.RuleDrop:
- return -linux.NF_DROP - 1
- case stack.RuleReturn:
- return linux.NF_RETURN
- default:
- // TODO(gvisor.dev/issue/170): Support Jump.
- panic(fmt.Sprintf("unknown standard verdict: %d", verdict))
+// setHooksAndUnderflow checks whether the rule at ruleIdx is a hook entrypoint
+// or underflow, in which case it fills in info.HookEntry and info.Underflow.
+func setHooksAndUnderflow(info *linux.IPTGetinfo, table stack.Table, offset uint32, ruleIdx int) {
+ // Is this a chain entry point?
+ for hook, hookRuleIdx := range table.BuiltinChains {
+ if hookRuleIdx == ruleIdx {
+ nflog("convert to binary: found hook %d at offset %d", hook, offset)
+ info.HookEntry[hook] = offset
+ }
}
-}
-
-// translateToStandardTarget translates from the value in a
-// linux.XTStandardTarget to an stack.Verdict.
-func translateToStandardTarget(val int32) (stack.Target, error) {
- // TODO(gvisor.dev/issue/170): Support other verdicts.
- switch val {
- case -linux.NF_ACCEPT - 1:
- return stack.AcceptTarget{}, nil
- case -linux.NF_DROP - 1:
- return stack.DropTarget{}, nil
- case -linux.NF_QUEUE - 1:
- return nil, errors.New("unsupported iptables verdict QUEUE")
- case linux.NF_RETURN:
- return stack.ReturnTarget{}, nil
- default:
- return nil, fmt.Errorf("unknown iptables verdict %d", val)
+ // Is this a chain underflow point?
+ for underflow, underflowRuleIdx := range table.Underflows {
+ if underflowRuleIdx == ruleIdx {
+ nflog("convert to binary: found underflow %d at offset %d", underflow, offset)
+ info.Underflow[underflow] = offset
+ }
}
}
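setHooksAndUnderflow and SetEntries below both key off table.ValidHooks(), a bitmask with one bit per netfilter hook. The membership test, isolated as a sketch:

// hookIsValid reports whether a table implements the given hook, using
// the same one-bit-per-hook convention as ValidHooks().
func hookIsValid(validHooks uint32, hook int) bool {
	return validHooks&(1<<uint(hook)) != 0
}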
// SetEntries sets iptables rules for a single table. See
// net/ipv4/netfilter/ip_tables.c:translate_table for reference.
-func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
- // Get the basic rules data (struct ipt_replace).
- if len(optVal) < linux.SizeOfIPTReplace {
- nflog("optVal has insufficient size for replace %d", len(optVal))
- return syserr.ErrInvalidArgument
- }
+func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error {
var replace linux.IPTReplace
replaceBuf := optVal[:linux.SizeOfIPTReplace]
optVal = optVal[linux.SizeOfIPTReplace:]
@@ -403,88 +144,24 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
// TODO(gvisor.dev/issue/170): Support other tables.
var table stack.Table
switch replace.Name.String() {
- case stack.TablenameFilter:
+ case stack.FilterTable:
table = stack.EmptyFilterTable()
- case stack.TablenameNat:
- table = stack.EmptyNatTable()
+ case stack.NATTable:
+ table = stack.EmptyNATTable()
default:
nflog("we don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String())
return syserr.ErrInvalidArgument
}
- nflog("set entries: setting entries in table %q", replace.Name.String())
-
- // Convert input into a list of rules and their offsets.
- var offset uint32
- // offsets maps rule byte offsets to their position in table.Rules.
- offsets := map[uint32]int{}
- for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
- nflog("set entries: processing entry at offset %d", offset)
-
- // Get the struct ipt_entry.
- if len(optVal) < linux.SizeOfIPTEntry {
- nflog("optVal has insufficient size for entry %d", len(optVal))
- return syserr.ErrInvalidArgument
- }
- var entry linux.IPTEntry
- buf := optVal[:linux.SizeOfIPTEntry]
- binary.Unmarshal(buf, usermem.ByteOrder, &entry)
- initialOptValLen := len(optVal)
- optVal = optVal[linux.SizeOfIPTEntry:]
-
- if entry.TargetOffset < linux.SizeOfIPTEntry {
- nflog("entry has too-small target offset %d", entry.TargetOffset)
- return syserr.ErrInvalidArgument
- }
-
- // TODO(gvisor.dev/issue/170): We should support more IPTIP
- // filtering fields.
- filter, err := filterFromIPTIP(entry.IP)
- if err != nil {
- nflog("bad iptip: %v", err)
- return syserr.ErrInvalidArgument
- }
-
- // TODO(gvisor.dev/issue/170): Matchers and targets can specify
- // that they only work for certain protocols, hooks, tables.
- // Get matchers.
- matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry
- if len(optVal) < int(matchersSize) {
- nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal))
- return syserr.ErrInvalidArgument
- }
- matchers, err := parseMatchers(filter, optVal[:matchersSize])
- if err != nil {
- nflog("failed to parse matchers: %v", err)
- return syserr.ErrInvalidArgument
- }
- optVal = optVal[matchersSize:]
-
- // Get the target of the rule.
- targetSize := entry.NextOffset - entry.TargetOffset
- if len(optVal) < int(targetSize) {
- nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
- return syserr.ErrInvalidArgument
- }
- target, err := parseTarget(filter, optVal[:targetSize])
- if err != nil {
- nflog("failed to parse target: %v", err)
- return syserr.ErrInvalidArgument
- }
- optVal = optVal[targetSize:]
-
- table.Rules = append(table.Rules, stack.Rule{
- Filter: filter,
- Target: target,
- Matchers: matchers,
- })
- offsets[offset] = int(entryIdx)
- offset += uint32(entry.NextOffset)
-
- if initialOptValLen-len(optVal) != int(entry.NextOffset) {
- nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal))
- return syserr.ErrInvalidArgument
- }
+ var err *syserr.Error
+ var offsets map[uint32]int
+ if ipv6 {
+ offsets, err = modifyEntries6(stk, optVal, &replace, &table)
+ } else {
+ offsets, err = modifyEntries4(stk, optVal, &replace, &table)
+ }
+ if err != nil {
+ return err
}
// Go through the list of supported hooks for this table and, for each
@@ -492,12 +169,14 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
for hook, _ := range replace.HookEntry {
if table.ValidHooks()&(1<<hook) != 0 {
hk := hookFromLinux(hook)
+ table.BuiltinChains[hk] = stack.HookUnset
+ table.Underflows[hk] = stack.HookUnset
for offset, ruleIdx := range offsets {
if offset == replace.HookEntry[hook] {
table.BuiltinChains[hk] = ruleIdx
}
if offset == replace.Underflow[hook] {
- if !validUnderflow(table.Rules[ruleIdx]) {
+ if !validUnderflow(table.Rules[ruleIdx], ipv6) {
nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP", ruleIdx)
return syserr.ErrInvalidArgument
}
@@ -515,10 +194,9 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
}
}
- // Add the user chains.
+ // Check the user chains.
for ruleIdx, rule := range table.Rules {
- target, ok := rule.Target.(stack.UserChainTarget)
- if !ok {
+ if _, ok := rule.Target.(*stack.UserChainTarget); !ok {
continue
}
@@ -534,13 +212,12 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
nflog("user chain's first node must have no matchers")
return syserr.ErrInvalidArgument
}
- table.UserChains[target.Name] = ruleIdx + 1
}
// Set each jump to point to the appropriate rule. Right now they hold byte
// offsets.
for ruleIdx, rule := range table.Rules {
- jump, ok := rule.Target.(JumpTarget)
+ jump, ok := rule.Target.(*JumpTarget)
if !ok {
continue
}
@@ -560,8 +237,11 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
// Since we only support modifying the INPUT, PREROUTING and OUTPUT chain right now,
// make sure all other chains point to ACCEPT rules.
for hook, ruleIdx := range table.BuiltinChains {
- if hook == stack.Forward || hook == stack.Postrouting {
- if !isUnconditionalAccept(table.Rules[ruleIdx]) {
+ if hook := stack.Hook(hook); hook == stack.Forward || hook == stack.Postrouting {
+ if ruleIdx == stack.HookUnset {
+ continue
+ }
+ if !isUnconditionalAccept(table.Rules[ruleIdx], ipv6) {
nflog("hook %d is unsupported.", hook)
return syserr.ErrInvalidArgument
}
@@ -573,17 +253,8 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
// - There are no chains without an unconditional final rule.
// - There are no chains without an unconditional underflow rule.
- ipt := stk.IPTables()
- table.SetMetadata(metadata{
- HookEntry: replace.HookEntry,
- Underflow: replace.Underflow,
- NumEntries: replace.NumEntries,
- Size: replace.Size,
- })
- ipt.Tables[replace.Name.String()] = table
- stk.SetIPTables(ipt)
-
- return nil
+ return syserr.TranslateNetstackError(stk.IPTables().ReplaceTable(replace.Name.String(), table, ipv6))
}
// parseMatchers parses 0 or more matchers from optVal. optVal should contain
@@ -605,7 +276,6 @@ func parseMatchers(filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher,
// Check some invariants.
if match.MatchSize < linux.SizeOfXTEntryMatch {
-
return nil, fmt.Errorf("match size is too small, must be at least %d", linux.SizeOfXTEntryMatch)
}
if len(optVal) < int(match.MatchSize) {
@@ -630,186 +300,26 @@ func parseMatchers(filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher,
return matchers, nil
}
-// parseTarget parses a target from optVal. optVal should contain only the
-// target.
-func parseTarget(filter stack.IPHeaderFilter, optVal []byte) (stack.Target, error) {
- nflog("set entries: parsing target of size %d", len(optVal))
- if len(optVal) < linux.SizeOfXTEntryTarget {
- return nil, fmt.Errorf("optVal has insufficient size for entry target %d", len(optVal))
- }
- var target linux.XTEntryTarget
- buf := optVal[:linux.SizeOfXTEntryTarget]
- binary.Unmarshal(buf, usermem.ByteOrder, &target)
- switch target.Name.String() {
- case "":
- // Standard target.
- if len(optVal) != linux.SizeOfXTStandardTarget {
- return nil, fmt.Errorf("optVal has wrong size for standard target %d", len(optVal))
- }
- var standardTarget linux.XTStandardTarget
- buf = optVal[:linux.SizeOfXTStandardTarget]
- binary.Unmarshal(buf, usermem.ByteOrder, &standardTarget)
-
- if standardTarget.Verdict < 0 {
- // A Verdict < 0 indicates a non-jump verdict.
- return translateToStandardTarget(standardTarget.Verdict)
- }
- // A verdict >= 0 indicates a jump.
- return JumpTarget{Offset: uint32(standardTarget.Verdict)}, nil
-
- case errorTargetName:
- // Error target.
- if len(optVal) != linux.SizeOfXTErrorTarget {
- return nil, fmt.Errorf("optVal has insufficient size for error target %d", len(optVal))
- }
- var errorTarget linux.XTErrorTarget
- buf = optVal[:linux.SizeOfXTErrorTarget]
- binary.Unmarshal(buf, usermem.ByteOrder, &errorTarget)
-
- // Error targets are used in 2 cases:
- // * An actual error case. These rules have an error
- // named errorTargetName. The last entry of the table
- // is usually an error case to catch any packets that
- // somehow fall through every rule.
- // * To mark the start of a user defined chain. These
- // rules have an error with the name of the chain.
- switch name := errorTarget.Name.String(); name {
- case errorTargetName:
- nflog("set entries: error target")
- return stack.ErrorTarget{}, nil
- default:
- // User defined chain.
- nflog("set entries: user-defined target %q", name)
- return stack.UserChainTarget{Name: name}, nil
- }
-
- case redirectTargetName:
- // Redirect target.
- if len(optVal) < linux.SizeOfXTRedirectTarget {
- return nil, fmt.Errorf("netfilter.SetEntries: optVal has insufficient size for redirect target %d", len(optVal))
- }
-
- if filter.Protocol != header.TCPProtocolNumber && filter.Protocol != header.UDPProtocolNumber {
- return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
- }
-
- var redirectTarget linux.XTRedirectTarget
- buf = optVal[:linux.SizeOfXTRedirectTarget]
- binary.Unmarshal(buf, usermem.ByteOrder, &redirectTarget)
-
- // Copy linux.XTRedirectTarget to stack.RedirectTarget.
- var target stack.RedirectTarget
- nfRange := redirectTarget.NfRange
-
- // RangeSize should be 1.
- if nfRange.RangeSize != 1 {
- return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
- }
-
- // TODO(gvisor.dev/issue/170): Check if the flags are valid.
- // Also check if we need to map ports or IP.
- // For now, redirect target only supports destination port change.
- // Port range and IP range are not supported yet.
- if nfRange.RangeIPV4.Flags&linux.NF_NAT_RANGE_PROTO_SPECIFIED == 0 {
- return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
- }
- target.RangeProtoSpecified = true
-
- target.MinIP = tcpip.Address(nfRange.RangeIPV4.MinIP[:])
- target.MaxIP = tcpip.Address(nfRange.RangeIPV4.MaxIP[:])
-
- // TODO(gvisor.dev/issue/170): Port range is not supported yet.
- if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort {
- return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
- }
-
- // Convert port from big endian to little endian.
- port := make([]byte, 2)
- binary.BigEndian.PutUint16(port, nfRange.RangeIPV4.MinPort)
- target.MinPort = binary.LittleEndian.Uint16(port)
-
- binary.BigEndian.PutUint16(port, nfRange.RangeIPV4.MaxPort)
- target.MaxPort = binary.LittleEndian.Uint16(port)
- return target, nil
- }
-
- // Unknown target.
- return nil, fmt.Errorf("unknown target %q doesn't exist or isn't supported yet.", target.Name.String())
-}
-
-func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) {
- if containsUnsupportedFields(iptip) {
- return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip)
- }
- if len(iptip.Dst) != header.IPv4AddressSize || len(iptip.DstMask) != header.IPv4AddressSize {
- return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask))
- }
- if len(iptip.Src) != header.IPv4AddressSize || len(iptip.SrcMask) != header.IPv4AddressSize {
- return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask))
- }
-
- n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0)
- if n == -1 {
- n = len(iptip.OutputInterface)
- }
- ifname := string(iptip.OutputInterface[:n])
-
- n = bytes.IndexByte([]byte(iptip.OutputInterfaceMask[:]), 0)
- if n == -1 {
- n = len(iptip.OutputInterfaceMask)
- }
- ifnameMask := string(iptip.OutputInterfaceMask[:n])
-
- return stack.IPHeaderFilter{
- Protocol: tcpip.TransportProtocolNumber(iptip.Protocol),
- Dst: tcpip.Address(iptip.Dst[:]),
- DstMask: tcpip.Address(iptip.DstMask[:]),
- DstInvert: iptip.InverseFlags&linux.IPT_INV_DSTIP != 0,
- Src: tcpip.Address(iptip.Src[:]),
- SrcMask: tcpip.Address(iptip.SrcMask[:]),
- SrcInvert: iptip.InverseFlags&linux.IPT_INV_SRCIP != 0,
- OutputInterface: ifname,
- OutputInterfaceMask: ifnameMask,
- OutputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_OUT != 0,
- }, nil
-}
-
-func containsUnsupportedFields(iptip linux.IPTIP) bool {
- // The following features are supported:
- // - Protocol
- // - Dst and DstMask
- // - Src and SrcMask
- // - The inverse destination IP check flag
- // - OutputInterface, OutputInterfaceMask and its inverse.
- var emptyInterface = [linux.IFNAMSIZ]byte{}
- // Disable any supported inverse flags.
- inverseMask := uint8(linux.IPT_INV_DSTIP) | uint8(linux.IPT_INV_SRCIP) | uint8(linux.IPT_INV_VIA_OUT)
- return iptip.InputInterface != emptyInterface ||
- iptip.InputInterfaceMask != emptyInterface ||
- iptip.Flags != 0 ||
- iptip.InverseFlags&^inverseMask != 0
-}
-
-func validUnderflow(rule stack.Rule) bool {
+func validUnderflow(rule stack.Rule, ipv6 bool) bool {
if len(rule.Matchers) != 0 {
return false
}
- if rule.Filter != emptyFilter {
+ if (ipv6 && rule.Filter != emptyIPv6Filter) || (!ipv6 && rule.Filter != emptyIPv4Filter) {
return false
}
switch rule.Target.(type) {
- case stack.AcceptTarget, stack.DropTarget:
+ case *stack.AcceptTarget, *stack.DropTarget:
return true
default:
return false
}
}
-func isUnconditionalAccept(rule stack.Rule) bool {
- if !validUnderflow(rule) {
+func isUnconditionalAccept(rule stack.Rule, ipv6 bool) bool {
+ if !validUnderflow(rule, ipv6) {
return false
}
- _, ok := rule.Target.(stack.AcceptTarget)
+ _, ok := rule.Target.(*stack.AcceptTarget)
return ok
}
@@ -828,3 +338,20 @@ func hookFromLinux(hook int) stack.Hook {
}
panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook))
}
+
+// TargetRevision returns a linux.XTGetRevision for a given target. It sets
+// Revision to the highest supported value, unless the provided revision number
+// is larger.
+func TargetRevision(t *kernel.Task, revPtr usermem.Addr, netProto tcpip.NetworkProtocolNumber) (linux.XTGetRevision, *syserr.Error) {
+ // Read in the target name and version.
+ var rev linux.XTGetRevision
+ if _, err := rev.CopyIn(t, revPtr); err != nil {
+ return linux.XTGetRevision{}, syserr.FromError(err)
+ }
+ maxSupported, ok := targetRevision(rev.Name.String(), netProto, rev.Revision)
+ if !ok {
+ return linux.XTGetRevision{}, syserr.ErrProtocolNotSupported
+ }
+ rev.Revision = maxSupported
+ return rev, nil
+}
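targetRevision itself is defined elsewhere in this package; a plausible sketch of its contract, assuming targets that currently only exist at revision 0:

// targetRevisionSketch reports the highest revision supported for a
// target, refusing requests for revisions newer than it knows about.
func targetRevisionSketch(requested uint8) (uint8, bool) {
	const highestSupported = 0 // assumed
	if requested > highestSupported {
		return 0, false
	}
	return highestSupported, true
}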
diff --git a/pkg/sentry/socket/netfilter/owner_matcher.go b/pkg/sentry/socket/netfilter/owner_matcher.go
index 3863293c7..1b4e0ad79 100644
--- a/pkg/sentry/socket/netfilter/owner_matcher.go
+++ b/pkg/sentry/socket/netfilter/owner_matcher.go
@@ -111,7 +111,7 @@ func (*OwnerMatcher) Name() string {
}
// Match implements Matcher.Match.
-func (om *OwnerMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceName string) (bool, bool) {
+func (om *OwnerMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
// Support only for OUTPUT chain.
// TODO(gvisor.dev/issue/170): Need to support for POSTROUTING chain also.
if hook != stack.Output {
diff --git a/pkg/sentry/socket/netfilter/targets.go b/pkg/sentry/socket/netfilter/targets.go
index 84abe8d29..0e14447fe 100644
--- a/pkg/sentry/socket/netfilter/targets.go
+++ b/pkg/sentry/socket/netfilter/targets.go
@@ -15,10 +15,359 @@
package netfilter
import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/stack"
+ "gvisor.dev/gvisor/pkg/usermem"
)
+func init() {
+ // Standard targets include ACCEPT, DROP, RETURN, and JUMP.
+ registerTargetMaker(&standardTargetMaker{
+ NetworkProtocol: header.IPv4ProtocolNumber,
+ })
+ registerTargetMaker(&standardTargetMaker{
+ NetworkProtocol: header.IPv6ProtocolNumber,
+ })
+
+ // Both user chains and actual errors are represented in iptables by
+ // error targets.
+ registerTargetMaker(&errorTargetMaker{
+ NetworkProtocol: header.IPv4ProtocolNumber,
+ })
+ registerTargetMaker(&errorTargetMaker{
+ NetworkProtocol: header.IPv6ProtocolNumber,
+ })
+
+ registerTargetMaker(&redirectTargetMaker{
+ NetworkProtocol: header.IPv4ProtocolNumber,
+ })
+ registerTargetMaker(&nfNATTargetMaker{
+ NetworkProtocol: header.IPv6ProtocolNumber,
+ })
+}
+
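registerTargetMaker and the targetMaker interface are defined elsewhere in this package. Judging from the id(), marshal(), and unmarshal() methods below, a plausible shape for that registry is a map keyed by stack.TargetID:

// A registry sketch: makers self-describe via id() and are looked up
// by TargetID when (un)marshaling targets.
type targetMaker interface {
	id() stack.TargetID
	marshal(target stack.Target) []byte
	unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error)
}

var targetMakers = map[stack.TargetID]targetMaker{}

func registerTargetMaker(tm targetMaker) {
	targetMakers[tm.id()] = tm
}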
+type standardTargetMaker struct {
+ NetworkProtocol tcpip.NetworkProtocolNumber
+}
+
+func (sm *standardTargetMaker) id() stack.TargetID {
+ // Standard targets have the empty string as a name and no revisions.
+ return stack.TargetID{
+ NetworkProtocol: sm.NetworkProtocol,
+ }
+}
+func (*standardTargetMaker) marshal(target stack.Target) []byte {
+ // Translate verdicts the same way as the iptables tool.
+ var verdict int32
+ switch tg := target.(type) {
+ case *stack.AcceptTarget:
+ verdict = -linux.NF_ACCEPT - 1
+ case *stack.DropTarget:
+ verdict = -linux.NF_DROP - 1
+ case *stack.ReturnTarget:
+ verdict = linux.NF_RETURN
+ case *JumpTarget:
+ verdict = int32(tg.Offset)
+ default:
+ panic(fmt.Errorf("unknown target of type %T", target))
+ }
+
+ // The target's name will be the empty string.
+ xt := linux.XTStandardTarget{
+ Target: linux.XTEntryTarget{
+ TargetSize: linux.SizeOfXTStandardTarget,
+ },
+ Verdict: verdict,
+ }
+
+ ret := make([]byte, 0, linux.SizeOfXTStandardTarget)
+ return binary.Marshal(ret, usermem.ByteOrder, xt)
+}
+
+func (*standardTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) {
+ if len(buf) != linux.SizeOfXTStandardTarget {
+ nflog("buf has wrong size for standard target %d", len(buf))
+ return nil, syserr.ErrInvalidArgument
+ }
+ var standardTarget linux.XTStandardTarget
+ buf = buf[:linux.SizeOfXTStandardTarget]
+ binary.Unmarshal(buf, usermem.ByteOrder, &standardTarget)
+
+ if standardTarget.Verdict < 0 {
+ // A Verdict < 0 indicates a non-jump verdict.
+ return translateToStandardTarget(standardTarget.Verdict, filter.NetworkProtocol())
+ }
+ // A verdict >= 0 indicates a jump.
+ return &JumpTarget{
+ Offset: uint32(standardTarget.Verdict),
+ NetworkProtocol: filter.NetworkProtocol(),
+ }, nil
+}
+
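XTStandardTarget.Verdict is overloaded: builtin verdicts are encoded as -(NF_* + 1), leaving non-negative values free to serve as jump offsets. The split in isolation, as a sketch:

// decodeVerdict separates the two uses of the Verdict field.
func decodeVerdict(verdict int32) (jumpOffset uint32, isJump bool) {
	if verdict >= 0 {
		return uint32(verdict), true
	}
	// Negative values encode builtins, e.g. -NF_ACCEPT-1, -NF_DROP-1.
	return 0, false
}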
+type errorTargetMaker struct {
+ NetworkProtocol tcpip.NetworkProtocolNumber
+}
+
+func (em *errorTargetMaker) id() stack.TargetID {
+ // Error targets have no revision.
+ return stack.TargetID{
+ Name: stack.ErrorTargetName,
+ NetworkProtocol: em.NetworkProtocol,
+ }
+}
+
+func (*errorTargetMaker) marshal(target stack.Target) []byte {
+ var errorName string
+ switch tg := target.(type) {
+ case *stack.ErrorTarget:
+ errorName = stack.ErrorTargetName
+ case *stack.UserChainTarget:
+ errorName = tg.Name
+ default:
+ panic(fmt.Sprintf("errorMakerTarget cannot marshal unknown type %T", target))
+ }
+
+ // This is an error target; Name carries the error or user chain name.
+ xt := linux.XTErrorTarget{
+ Target: linux.XTEntryTarget{
+ TargetSize: linux.SizeOfXTErrorTarget,
+ },
+ }
+ copy(xt.Name[:], errorName)
+ copy(xt.Target.Name[:], stack.ErrorTargetName)
+
+ ret := make([]byte, 0, linux.SizeOfXTErrorTarget)
+ return binary.Marshal(ret, usermem.ByteOrder, xt)
+}
+
+func (*errorTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) {
+ if len(buf) != linux.SizeOfXTErrorTarget {
+ nflog("buf has insufficient size for error target %d", len(buf))
+ return nil, syserr.ErrInvalidArgument
+ }
+ var errorTarget linux.XTErrorTarget
+ buf = buf[:linux.SizeOfXTErrorTarget]
+ binary.Unmarshal(buf, usermem.ByteOrder, &errorTarget)
+
+ // Error targets are used in 2 cases:
+ // * An actual error case. These rules have an error
+ // named stack.ErrorTargetName. The last entry of the table
+ // is usually an error case to catch any packets that
+ // somehow fall through every rule.
+ // * To mark the start of a user defined chain. These
+ // rules have an error with the name of the chain.
+ switch name := errorTarget.Name.String(); name {
+ case stack.ErrorTargetName:
+ return &stack.ErrorTarget{NetworkProtocol: filter.NetworkProtocol()}, nil
+ default:
+ // User defined chain.
+ return &stack.UserChainTarget{
+ Name: name,
+ NetworkProtocol: filter.NetworkProtocol(),
+ }, nil
+ }
+}
+
+type redirectTargetMaker struct {
+ NetworkProtocol tcpip.NetworkProtocolNumber
+}
+
+func (rm *redirectTargetMaker) id() stack.TargetID {
+ return stack.TargetID{
+ Name: stack.RedirectTargetName,
+ NetworkProtocol: rm.NetworkProtocol,
+ }
+}
+
+func (*redirectTargetMaker) marshal(target stack.Target) []byte {
+ rt := target.(*stack.RedirectTarget)
+ // This is a redirect target named REDIRECT.
+ xt := linux.XTRedirectTarget{
+ Target: linux.XTEntryTarget{
+ TargetSize: linux.SizeOfXTRedirectTarget,
+ },
+ }
+ copy(xt.Target.Name[:], stack.RedirectTargetName)
+
+ ret := make([]byte, 0, linux.SizeOfXTRedirectTarget)
+ xt.NfRange.RangeSize = 1
+ xt.NfRange.RangeIPV4.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED
+ xt.NfRange.RangeIPV4.MinPort = htons(rt.Port)
+ xt.NfRange.RangeIPV4.MaxPort = xt.NfRange.RangeIPV4.MinPort
+ return binary.Marshal(ret, usermem.ByteOrder, xt)
+}
+
+func (*redirectTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) {
+ if len(buf) < linux.SizeOfXTRedirectTarget {
+ nflog("redirectTargetMaker: buf has insufficient size for redirect target %d", len(buf))
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ if p := filter.Protocol; p != header.TCPProtocolNumber && p != header.UDPProtocolNumber {
+ nflog("redirectTargetMaker: bad proto %d", p)
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ var redirectTarget linux.XTRedirectTarget
+ buf = buf[:linux.SizeOfXTRedirectTarget]
+ binary.Unmarshal(buf, usermem.ByteOrder, &redirectTarget)
+
+ // Copy linux.XTRedirectTarget to stack.RedirectTarget.
+ target := stack.RedirectTarget{NetworkProtocol: filter.NetworkProtocol()}
+
+ // RangeSize should be 1.
+ nfRange := redirectTarget.NfRange
+ if nfRange.RangeSize != 1 {
+ nflog("redirectTargetMaker: bad rangesize %d", nfRange.RangeSize)
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ // TODO(gvisor.dev/issue/170): Check if the flags are valid.
+ // Also check if we need to map ports or IP.
+ // For now, redirect target only supports destination port change.
+ // Port range and IP range are not supported yet.
+ if nfRange.RangeIPV4.Flags != linux.NF_NAT_RANGE_PROTO_SPECIFIED {
+ nflog("redirectTargetMaker: invalid range flags %d", nfRange.RangeIPV4.Flags)
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ // TODO(gvisor.dev/issue/170): Port range is not supported yet.
+ if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort {
+ nflog("redirectTargetMaker: MinPort != MaxPort (%d, %d)", nfRange.RangeIPV4.MinPort, nfRange.RangeIPV4.MaxPort)
+ return nil, syserr.ErrInvalidArgument
+ }
+ if nfRange.RangeIPV4.MinIP != nfRange.RangeIPV4.MaxIP {
+ nflog("redirectTargetMaker: MinIP != MaxIP (%d, %d)", nfRange.RangeIPV4.MinPort, nfRange.RangeIPV4.MaxPort)
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ target.Addr = tcpip.Address(nfRange.RangeIPV4.MinIP[:])
+ target.Port = ntohs(nfRange.RangeIPV4.MinPort)
+
+ return &target, nil
+}
+
+type nfNATTarget struct {
+ Target linux.XTEntryTarget
+ Range linux.NFNATRange
+}
+
+const nfNATMarshalledSize = linux.SizeOfXTEntryTarget + linux.SizeOfNFNATRange
+
+type nfNATTargetMaker struct {
+ NetworkProtocol tcpip.NetworkProtocolNumber
+}
+
+func (rm *nfNATTargetMaker) id() stack.TargetID {
+ return stack.TargetID{
+ Name: stack.RedirectTargetName,
+ NetworkProtocol: rm.NetworkProtocol,
+ }
+}
+
+func (*nfNATTargetMaker) marshal(target stack.Target) []byte {
+ rt := target.(*stack.RedirectTarget)
+ nt := nfNATTarget{
+ Target: linux.XTEntryTarget{
+ TargetSize: nfNATMarshalledSize,
+ },
+ Range: linux.NFNATRange{
+ Flags: linux.NF_NAT_RANGE_PROTO_SPECIFIED,
+ },
+ }
+ copy(nt.Target.Name[:], stack.RedirectTargetName)
+ copy(nt.Range.MinAddr[:], rt.Addr)
+ copy(nt.Range.MaxAddr[:], rt.Addr)
+
+ nt.Range.MinProto = htons(rt.Port)
+ nt.Range.MaxProto = nt.Range.MinProto
+
+ ret := make([]byte, 0, nfNATMarshalledSize)
+ return binary.Marshal(ret, usermem.ByteOrder, nt)
+}
+
+func (*nfNATTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) {
+ if size := nfNATMarshalledSize; len(buf) < size {
+ nflog("nfNATTargetMaker: buf has insufficient size (%d) for nfNAT target (%d)", len(buf), size)
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ if p := filter.Protocol; p != header.TCPProtocolNumber && p != header.UDPProtocolNumber {
+ nflog("nfNATTargetMaker: bad proto %d", p)
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ var natRange linux.NFNATRange
+ buf = buf[linux.SizeOfXTEntryTarget:nfNATMarshalledSize]
+ binary.Unmarshal(buf, usermem.ByteOrder, &natRange)
+
+ // We don't support port or address ranges.
+ if natRange.MinAddr != natRange.MaxAddr {
+ nflog("nfNATTargetMaker: MinAddr and MaxAddr are different")
+ return nil, syserr.ErrInvalidArgument
+ }
+ if natRange.MinProto != natRange.MaxProto {
+ nflog("nfNATTargetMaker: MinProto and MaxProto are different")
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ // TODO(gvisor.dev/issue/3549): Check for other flags.
+ // For now, redirect target only supports destination change.
+ if natRange.Flags != linux.NF_NAT_RANGE_PROTO_SPECIFIED {
+ nflog("nfNATTargetMaker: invalid range flags %d", natRange.Flags)
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ target := stack.RedirectTarget{
+ NetworkProtocol: filter.NetworkProtocol(),
+ Addr: tcpip.Address(natRange.MinAddr[:]),
+ Port: ntohs(natRange.MinProto),
+ }
+
+ return &target, nil
+}
+
+// translateToStandardTarget translates from the value in a
+// linux.XTStandardTarget to an stack.Verdict.
+func translateToStandardTarget(val int32, netProto tcpip.NetworkProtocolNumber) (stack.Target, *syserr.Error) {
+ // TODO(gvisor.dev/issue/170): Support other verdicts.
+ switch val {
+ case -linux.NF_ACCEPT - 1:
+ return &stack.AcceptTarget{NetworkProtocol: netProto}, nil
+ case -linux.NF_DROP - 1:
+ return &stack.DropTarget{NetworkProtocol: netProto}, nil
+ case -linux.NF_QUEUE - 1:
+ nflog("unsupported iptables verdict QUEUE")
+ return nil, syserr.ErrInvalidArgument
+ case linux.NF_RETURN:
+ return &stack.ReturnTarget{NetworkProtocol: netProto}, nil
+ default:
+ nflog("unknown iptables verdict %d", val)
+ return nil, syserr.ErrInvalidArgument
+ }
+}
+
+// parseTarget parses a target from optVal. optVal should contain only the
+// target.
+func parseTarget(filter stack.IPHeaderFilter, optVal []byte, ipv6 bool) (stack.Target, *syserr.Error) {
+ nflog("set entries: parsing target of size %d", len(optVal))
+ if len(optVal) < linux.SizeOfXTEntryTarget {
+ nflog("optVal has insufficient size for entry target %d", len(optVal))
+ return nil, syserr.ErrInvalidArgument
+ }
+ var target linux.XTEntryTarget
+ buf := optVal[:linux.SizeOfXTEntryTarget]
+ binary.Unmarshal(buf, usermem.ByteOrder, &target)
+
+ return unmarshalTarget(target, filter, optVal)
+}
+
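unmarshalTarget is defined elsewhere in this package; given a registry like the sketch above, a plausible implementation is a TargetID lookup followed by delegation:

// unmarshalTargetSketch dispatches to the maker registered for the
// target's name and the filter's network protocol (assumed logic).
func unmarshalTargetSketch(target linux.XTEntryTarget, filter stack.IPHeaderFilter, optVal []byte) (stack.Target, *syserr.Error) {
	tid := stack.TargetID{
		Name:            target.Name.String(),
		NetworkProtocol: filter.NetworkProtocol(),
	}
	if tm, ok := targetMakers[tid]; ok {
		return tm.unmarshal(optVal, filter)
	}
	nflog("unknown target %q", target.Name.String())
	return nil, syserr.ErrInvalidArgument
}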
// JumpTarget implements stack.Target.
type JumpTarget struct {
// Offset is the byte offset of the rule to jump to. It is used for
@@ -27,9 +376,31 @@ type JumpTarget struct {
// RuleNum is the rule to jump to.
RuleNum int
+
+ // NetworkProtocol is the network protocol the target is used with.
+ NetworkProtocol tcpip.NetworkProtocolNumber
+}
+
+// ID implements Target.ID.
+func (jt *JumpTarget) ID() stack.TargetID {
+ return stack.TargetID{
+ NetworkProtocol: jt.NetworkProtocol,
+ }
}
// Action implements stack.Target.Action.
-func (jt JumpTarget) Action(*stack.PacketBuffer, *stack.ConnTrackTable, stack.Hook, *stack.GSO, *stack.Route, tcpip.Address) (stack.RuleVerdict, int) {
+func (jt *JumpTarget) Action(*stack.PacketBuffer, *stack.ConnTrack, stack.Hook, *stack.GSO, *stack.Route, tcpip.Address) (stack.RuleVerdict, int) {
return stack.RuleJump, jt.RuleNum
}
+
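+// ntohs converts port from network byte order to the host's byte order.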
+func ntohs(port uint16) uint16 {
+ buf := make([]byte, 2)
+ binary.BigEndian.PutUint16(buf, port)
+ return usermem.ByteOrder.Uint16(buf)
+}
+
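+// htons converts port from the host's byte order to network byte order.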
+func htons(port uint16) uint16 {
+ buf := make([]byte, 2)
+ usermem.ByteOrder.PutUint16(buf, port)
+ return binary.BigEndian.Uint16(buf)
+}
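ntohs and htons are inverses regardless of host endianness; a quick sanity sketch:

// portRoundTrip shows that a host-order port survives a round trip
// through network byte order.
func portRoundTrip() bool {
	const p uint16 = 8080
	return ntohs(htons(p)) == p // always true
}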
diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go
index 57a1e1c12..352c51390 100644
--- a/pkg/sentry/socket/netfilter/tcp_matcher.go
+++ b/pkg/sentry/socket/netfilter/tcp_matcher.go
@@ -71,7 +71,7 @@ func (tcpMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Ma
}
if filter.Protocol != header.TCPProtocolNumber {
- return nil, fmt.Errorf("TCP matching is only valid for protocol %d.", header.TCPProtocolNumber)
+ return nil, fmt.Errorf("TCP matching is only valid for protocol %d", header.TCPProtocolNumber)
}
return &TCPMatcher{
@@ -96,51 +96,41 @@ func (*TCPMatcher) Name() string {
}
// Match implements Matcher.Match.
-func (tm *TCPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceName string) (bool, bool) {
- netHeader := header.IPv4(pkt.NetworkHeader)
-
- if netHeader.TransportProtocol() != header.TCPProtocolNumber {
- return false, false
- }
-
- // We dont't match fragments.
- if frag := netHeader.FragmentOffset(); frag != 0 {
- if frag == 1 {
- return false, true
+func (tm *TCPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
+ // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
+ // into the stack.Check codepath as matchers are added.
+ switch pkt.NetworkProtocolNumber {
+ case header.IPv4ProtocolNumber:
+ netHeader := header.IPv4(pkt.NetworkHeader().View())
+ if netHeader.TransportProtocol() != header.TCPProtocolNumber {
+ return false, false
}
- return false, false
- }
- // Now we need the transport header. However, this may not have been set
- // yet.
- // TODO(gvisor.dev/issue/170): Parsing the transport header should
- // ultimately be moved into the stack.Check codepath as matchers are
- // added.
- var tcpHeader header.TCP
- if pkt.TransportHeader != nil {
- tcpHeader = header.TCP(pkt.TransportHeader)
- } else {
- var length int
- if hook == stack.Prerouting {
- // The network header hasn't been parsed yet. We have to do it here.
- hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
- if !ok {
- // There's no valid TCP header here, so we hotdrop the
- // packet.
+ // We don't match fragments.
+ if frag := netHeader.FragmentOffset(); frag != 0 {
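+			// An offset of 1 means the fragment would split the TCP
+			// header; Linux likewise hotdrops these tiny fragments.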
+ if frag == 1 {
return false, true
}
- h := header.IPv4(hdr)
- pkt.NetworkHeader = hdr
- length = int(h.HeaderLength())
+ return false, false
}
- // The TCP header hasn't been parsed yet. We have to do it here.
- hdr, ok := pkt.Data.PullUp(length + header.TCPMinimumSize)
- if !ok {
- // There's no valid TCP header here, so we hotdrop the
- // packet.
- return false, true
+
+ case header.IPv6ProtocolNumber:
+ // As in Linux, we do not perform an IPv6 fragment check. See
+ // xt_action_param.fragoff in
+ // include/linux/netfilter/x_tables.h.
+ if header.IPv6(pkt.NetworkHeader().View()).TransportProtocol() != header.TCPProtocolNumber {
+ return false, false
}
- tcpHeader = header.TCP(hdr[length:])
+
+ default:
+ // We don't know the network protocol.
+ return false, false
+ }
+
+ tcpHeader := header.TCP(pkt.TransportHeader().View())
+ if len(tcpHeader) < header.TCPMinimumSize {
+ // There's no valid TCP header here, so we drop the packet immediately.
+ return false, true
}
// Check whether the source and destination ports are within the
diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go
index cfa9e621d..c88d8268d 100644
--- a/pkg/sentry/socket/netfilter/udp_matcher.go
+++ b/pkg/sentry/socket/netfilter/udp_matcher.go
@@ -68,7 +68,7 @@ func (udpMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Ma
}
if filter.Protocol != header.UDPProtocolNumber {
- return nil, fmt.Errorf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber)
+ return nil, fmt.Errorf("UDP matching is only valid for protocol %d", header.UDPProtocolNumber)
}
return &UDPMatcher{
@@ -93,53 +93,41 @@ func (*UDPMatcher) Name() string {
}
// Match implements Matcher.Match.
-func (um *UDPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceName string) (bool, bool) {
- netHeader := header.IPv4(pkt.NetworkHeader)
-
+func (um *UDPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
// TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
// into the stack.Check codepath as matchers are added.
- if netHeader.TransportProtocol() != header.UDPProtocolNumber {
- return false, false
- }
-
- // We dont't match fragments.
- if frag := netHeader.FragmentOffset(); frag != 0 {
- if frag == 1 {
- return false, true
+ switch pkt.NetworkProtocolNumber {
+ case header.IPv4ProtocolNumber:
+ netHeader := header.IPv4(pkt.NetworkHeader().View())
+ if netHeader.TransportProtocol() != header.UDPProtocolNumber {
+ return false, false
}
- return false, false
- }
- // Now we need the transport header. However, this may not have been set
- // yet.
- // TODO(gvisor.dev/issue/170): Parsing the transport header should
- // ultimately be moved into the stack.Check codepath as matchers are
- // added.
- var udpHeader header.UDP
- if pkt.TransportHeader != nil {
- udpHeader = header.UDP(pkt.TransportHeader)
- } else {
- var length int
- if hook == stack.Prerouting {
- // The network header hasn't been parsed yet. We have to do it here.
- hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
- if !ok {
- // There's no valid UDP header here, so we hotdrop the
- // packet.
+ // We don't match fragments.
+ if frag := netHeader.FragmentOffset(); frag != 0 {
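+			// An offset of 1 indicates a malformed tiny fragment that
+			// would split the transport header, so hotdrop it.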
+ if frag == 1 {
return false, true
}
- h := header.IPv4(hdr)
- pkt.NetworkHeader = hdr
- length = int(h.HeaderLength())
+ return false, false
}
- // The UDP header hasn't been parsed yet. We have to do it here.
- hdr, ok := pkt.Data.PullUp(length + header.UDPMinimumSize)
- if !ok {
- // There's no valid UDP header here, so we hotdrop the
- // packet.
- return false, true
+
+ case header.IPv6ProtocolNumber:
+ // As in Linux, we do not perform an IPv6 fragment check. See
+ // xt_action_param.fragoff in
+ // include/linux/netfilter/x_tables.h.
+ if header.IPv6(pkt.NetworkHeader().View()).TransportProtocol() != header.UDPProtocolNumber {
+ return false, false
}
- udpHeader = header.UDP(hdr[length:])
+
+ default:
+ // We don't know the network protocol.
+ return false, false
+ }
+
+ udpHeader := header.UDP(pkt.TransportHeader().View())
+ if len(udpHeader) < header.UDPMinimumSize {
+ // There's no valid UDP header here, so we drop the packet immediately.
+ return false, true
}
// Check whether the source and destination ports are within the
diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index 7212d8644..1f926aa91 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -16,10 +16,13 @@ go_library(
"//pkg/abi/linux",
"//pkg/binary",
"//pkg/context",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
"//pkg/sentry/arch",
"//pkg/sentry/device",
"//pkg/sentry/fs",
"//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/fsimpl/sockfs",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go
index 0d45e5053..31e374833 100644
--- a/pkg/sentry/socket/netlink/provider.go
+++ b/pkg/sentry/socket/netlink/provider.go
@@ -97,7 +97,7 @@ func (*socketProvider) Socket(t *kernel.Task, stype linux.SockType, protocol int
}
d := socket.NewDirent(t, netlinkSocketDevice)
- defer d.DecRef()
+ defer d.DecRef(t)
return fs.NewFile(t, d, fs.FileFlags{Read: true, Write: true, NonSeekable: true}, s), nil
}
diff --git a/pkg/sentry/socket/netlink/provider_vfs2.go b/pkg/sentry/socket/netlink/provider_vfs2.go
index bb205be0d..f061c5d62 100644
--- a/pkg/sentry/socket/netlink/provider_vfs2.go
+++ b/pkg/sentry/socket/netlink/provider_vfs2.go
@@ -51,7 +51,8 @@ func (*socketProviderVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol
vfsfd := &s.vfsfd
mnt := t.Kernel().SocketMount()
- d := sockfs.NewDentry(t.Credentials(), mnt)
+ d := sockfs.NewDentry(t, mnt)
+ defer d.DecRef(t)
if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{
DenyPRead: true,
DenyPWrite: true,
diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go
index c84d8bd7c..22216158e 100644
--- a/pkg/sentry/socket/netlink/route/protocol.go
+++ b/pkg/sentry/socket/netlink/route/protocol.go
@@ -36,9 +36,9 @@ type commandKind int
const (
kindNew commandKind = 0x0
- kindDel = 0x1
- kindGet = 0x2
- kindSet = 0x3
+ kindDel commandKind = 0x1
+ kindGet commandKind = 0x2
+ kindSet commandKind = 0x3
)
func typeKind(typ uint16) commandKind {
@@ -423,6 +423,11 @@ func (p *Protocol) newAddr(ctx context.Context, msg *netlink.Message, ms *netlin
}
attrs = rest
+ // NOTE: A netlink message will contain multiple header attributes.
+ // Both the IFA_ADDRESS and IFA_LOCAL attributes are typically sent
+ // with IFA_ADDRESS being a prefix address and IFA_LOCAL being the
+ // local interface address. We add the local interface address here
+ // and ignore the IFA_ADDRESS.
switch ahdr.Type {
case linux.IFA_LOCAL:
err := stack.AddInterfaceAddr(int32(ifa.Index), inet.InterfaceAddr{
@@ -439,8 +444,57 @@ func (p *Protocol) newAddr(ctx context.Context, msg *netlink.Message, ms *netlin
} else if err != nil {
return syserr.ErrInvalidArgument
}
+ case linux.IFA_ADDRESS:
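+		// Intentionally ignored; see the NOTE above.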
+ default:
+ return syserr.ErrNotSupported
+ }
+ }
+ return nil
+}
+
+// delAddr handles RTM_DELADDR requests.
+func (p *Protocol) delAddr(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
+ stack := inet.StackFromContext(ctx)
+ if stack == nil {
+ // No network stack.
+ return syserr.ErrProtocolNotSupported
+ }
+
+ var ifa linux.InterfaceAddrMessage
+ attrs, ok := msg.GetData(&ifa)
+ if !ok {
+ return syserr.ErrInvalidArgument
+ }
+
+ for !attrs.Empty() {
+ ahdr, value, rest, ok := attrs.ParseFirst()
+ if !ok {
+ return syserr.ErrInvalidArgument
+ }
+ attrs = rest
+
+ // NOTE: A netlink message will contain multiple header attributes.
+ // Both the IFA_ADDRESS and IFA_LOCAL attributes are typically sent
+ // with IFA_ADDRESS being a prefix address and IFA_LOCAL being the
+ // local interface address. We use the local interface address to
+ // remove the address and ignore the IFA_ADDRESS.
+ switch ahdr.Type {
+ case linux.IFA_LOCAL:
+ err := stack.RemoveInterfaceAddr(int32(ifa.Index), inet.InterfaceAddr{
+ Family: ifa.Family,
+ PrefixLen: ifa.PrefixLen,
+ Flags: ifa.Flags,
+ Addr: value,
+ })
+ if err != nil {
+ return syserr.ErrInvalidArgument
+ }
+ case linux.IFA_ADDRESS:
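+			// Intentionally ignored; see the NOTE above.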
+ default:
+ return syserr.ErrNotSupported
}
}
+
return nil
}
@@ -485,6 +539,8 @@ func (p *Protocol) ProcessMessage(ctx context.Context, msg *netlink.Message, ms
return p.dumpRoutes(ctx, msg, ms)
case linux.RTM_NEWADDR:
return p.newAddr(ctx, msg, ms)
+ case linux.RTM_DELADDR:
+ return p.delAddr(ctx, msg, ms)
default:
return syserr.ErrNotSupported
}
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index 81f34c5a2..3baad098b 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -16,11 +16,14 @@
package netlink
import (
+ "io"
"math"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/device"
"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -138,14 +141,14 @@ func NewSocket(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socke
// Bind the endpoint for good measure so we can connect to it. The
// bound address will never be exposed.
if err := ep.Bind(tcpip.FullAddress{Addr: "dummy"}, nil); err != nil {
- ep.Close()
+ ep.Close(t)
return nil, err
}
// Create a connection from which the kernel can write messages.
connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect(t)
if err != nil {
- ep.Close()
+ ep.Close(t)
return nil, err
}
@@ -162,9 +165,9 @@ func NewSocket(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socke
}
// Release implements fs.FileOperations.Release.
-func (s *socketOpsCommon) Release() {
- s.connection.Release()
- s.ep.Close()
+func (s *socketOpsCommon) Release(ctx context.Context) {
+ s.connection.Release(ctx)
+ s.ep.Close(ctx)
if s.bound {
s.ports.Release(s.protocol.Protocol(), s.portID)
@@ -330,7 +333,7 @@ func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error {
}
// GetSockOpt implements socket.Socket.GetSockOpt.
-func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
+func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
switch level {
case linux.SOL_SOCKET:
switch name {
@@ -340,24 +343,26 @@ func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr
}
s.mu.Lock()
defer s.mu.Unlock()
- return int32(s.sendBufferSize), nil
+ sendBufferSizeP := primitive.Int32(s.sendBufferSize)
+ return &sendBufferSizeP, nil
case linux.SO_RCVBUF:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
			// We don't have a limit on receiving size.
- return int32(math.MaxInt32), nil
+ recvBufferSizeP := primitive.Int32(math.MaxInt32)
+ return &recvBufferSizeP, nil
case linux.SO_PASSCRED:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
- var passcred int32
+ var passcred primitive.Int32
if s.Passcred() {
passcred = 1
}
- return passcred, nil
+ return &passcred, nil
default:
socket.GetSockOptEmitUnimplementedEvent(t, name)
@@ -617,7 +622,7 @@ func (s *socketOpsCommon) sendResponse(ctx context.Context, ms *MessageSet) *sys
if len(bufs) > 0 {
// RecvMsg never receives the address, so we don't need to send
// one.
- _, notify, err := s.connection.Send(bufs, cms, tcpip.FullAddress{})
+ _, notify, err := s.connection.Send(ctx, bufs, cms, tcpip.FullAddress{})
// If the buffer is full, we simply drop messages, just like
// Linux.
if err != nil && err != syserr.ErrWouldBlock {
@@ -644,7 +649,7 @@ func (s *socketOpsCommon) sendResponse(ctx context.Context, ms *MessageSet) *sys
// Add the dump_done_errno payload.
m.Put(int64(0))
- _, notify, err := s.connection.Send([][]byte{m.Finalize()}, cms, tcpip.FullAddress{})
+ _, notify, err := s.connection.Send(ctx, [][]byte{m.Finalize()}, cms, tcpip.FullAddress{})
if err != nil && err != syserr.ErrWouldBlock {
return err
}
@@ -744,6 +749,12 @@ func (s *socketOpsCommon) sendMsg(ctx context.Context, src usermem.IOSequence, t
buf := make([]byte, src.NumBytes())
n, err := src.CopyIn(ctx, buf)
+	// io.EOF can only be returned if src is a file; this means that
+	// sendMsg was called from splice, so the error has to be ignored in
+	// this case.
+ if err == io.EOF {
+ err = nil
+ }
if err != nil {
// Don't partially consume messages.
return 0, syserr.FromError(err)
diff --git a/pkg/sentry/socket/netlink/socket_vfs2.go b/pkg/sentry/socket/netlink/socket_vfs2.go
index b854bf990..461d524e5 100644
--- a/pkg/sentry/socket/netlink/socket_vfs2.go
+++ b/pkg/sentry/socket/netlink/socket_vfs2.go
@@ -18,6 +18,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/socket"
"gvisor.dev/gvisor/pkg/sentry/socket/unix"
@@ -36,10 +37,13 @@ import (
// to/from the kernel.
//
// SocketVFS2 implements socket.SocketVFS2 and transport.Credentialer.
+//
+// +stateify savable
type SocketVFS2 struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
+ vfs.LockFD
socketOpsCommon
}
@@ -55,18 +59,18 @@ func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketV
// Bind the endpoint for good measure so we can connect to it. The
// bound address will never be exposed.
if err := ep.Bind(tcpip.FullAddress{Addr: "dummy"}, nil); err != nil {
- ep.Close()
+ ep.Close(t)
return nil, err
}
// Create a connection from which the kernel can write messages.
connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect(t)
if err != nil {
- ep.Close()
+ ep.Close(t)
return nil, err
}
- return &SocketVFS2{
+ fd := &SocketVFS2{
socketOpsCommon: socketOpsCommon{
ports: t.Kernel().NetlinkPorts(),
protocol: protocol,
@@ -75,7 +79,16 @@ func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketV
connection: connection,
sendBufferSize: defaultSendBufferSize,
},
- }, nil
+ }
+ fd.LockFD.Init(&vfs.FileLocks{})
+ return fd, nil
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (s *SocketVFS2) Release(ctx context.Context) {
+ t := kernel.TaskFromContext(ctx)
+ t.Kernel().DeleteSocketVFS2(&s.vfsfd)
+ s.socketOpsCommon.Release(ctx)
}
// Readiness implements waiter.Waitable.Readiness.
@@ -136,3 +149,13 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs
n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{})
return int64(n), err.ToError()
}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD
index 333e0042e..fae3b6783 100644
--- a/pkg/sentry/socket/netstack/BUILD
+++ b/pkg/sentry/socket/netstack/BUILD
@@ -22,12 +22,15 @@ go_library(
"//pkg/binary",
"//pkg/context",
"//pkg/log",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
"//pkg/metric",
"//pkg/safemem",
"//pkg/sentry/arch",
"//pkg/sentry/device",
"//pkg/sentry/fs",
"//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/fsimpl/sockfs",
"//pkg/sentry/inet",
"//pkg/sentry/kernel",
@@ -50,5 +53,6 @@ go_library(
"//pkg/tcpip/transport/udp",
"//pkg/usermem",
"//pkg/waiter",
+ "@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 60df51dae..86c634715 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -26,6 +26,7 @@ package netstack
import (
"bytes"
+ "fmt"
"io"
"math"
"reflect"
@@ -33,11 +34,14 @@ import (
"syscall"
"time"
+ "golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/amutex"
"gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/metric"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -154,6 +158,9 @@ var Metrics = tcpip.Stats{
OutgoingPacketErrors: mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Total number of IP packets which failed to write to a link-layer endpoint."),
MalformedPacketsReceived: mustCreateMetric("/netstack/ip/malformed_packets_received", "Total number of IP packets which failed IP header validation checks."),
MalformedFragmentsReceived: mustCreateMetric("/netstack/ip/malformed_fragments_received", "Total number of IP fragments which failed IP fragment validation checks."),
+ IPTablesPreroutingDropped: mustCreateMetric("/netstack/ip/iptables/prerouting_dropped", "Total number of IP packets dropped in the Prerouting chain."),
+ IPTablesInputDropped: mustCreateMetric("/netstack/ip/iptables/input_dropped", "Total number of IP packets dropped in the Input chain."),
+ IPTablesOutputDropped: mustCreateMetric("/netstack/ip/iptables/output_dropped", "Total number of IP packets dropped in the Output chain."),
},
TCP: tcpip.TCPStats{
ActiveConnectionOpenings: mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."),
@@ -190,6 +197,7 @@ var Metrics = tcpip.Stats{
MalformedPacketsReceived: mustCreateMetric("/netstack/udp/malformed_packets_received", "Number of incoming UDP datagrams dropped due to the UDP header being in a malformed state."),
PacketsSent: mustCreateMetric("/netstack/udp/packets_sent", "Number of UDP datagrams sent."),
PacketSendErrors: mustCreateMetric("/netstack/udp/packet_send_errors", "Number of UDP datagrams failed to be sent."),
+ ChecksumErrors: mustCreateMetric("/netstack/udp/checksum_errors", "Number of UDP datagrams dropped due to bad checksums."),
},
}
@@ -230,7 +238,7 @@ type commonEndpoint interface {
// SetSockOpt implements tcpip.Endpoint.SetSockOpt and
// transport.Endpoint.SetSockOpt.
- SetSockOpt(interface{}) *tcpip.Error
+ SetSockOpt(tcpip.SettableSocketOption) *tcpip.Error
// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool and
// transport.Endpoint.SetSockOptBool.
@@ -242,7 +250,7 @@ type commonEndpoint interface {
// GetSockOpt implements tcpip.Endpoint.GetSockOpt and
// transport.Endpoint.GetSockOpt.
- GetSockOpt(interface{}) *tcpip.Error
+ GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error
// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool and
// transport.Endpoint.GetSockOpt.
@@ -251,6 +259,9 @@ type commonEndpoint interface {
// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and
// transport.Endpoint.GetSockOpt.
GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error)
+
+ // LastError implements tcpip.Endpoint.LastError.
+ LastError() *tcpip.Error
}
// LINT.IfChange
@@ -294,8 +305,9 @@ type socketOpsCommon struct {
readView buffer.View
// readCM holds control message information for the last packet read
// from Endpoint.
- readCM tcpip.ControlMessages
- sender tcpip.FullAddress
+ readCM tcpip.ControlMessages
+ sender tcpip.FullAddress
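+	// linkPacketInfo holds link-layer information for the last packet
+	// read from Endpoint. It is only set when Endpoint is a
+	// tcpip.PacketEndpoint.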
+ linkPacketInfo tcpip.LinkPacketInfo
// sockOptTimestamp corresponds to SO_TIMESTAMP. When true, timestamps
// of returned messages can be returned via control messages. When
@@ -323,7 +335,7 @@ func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue
}
dirent := socket.NewDirent(t, netstackDevice)
- defer dirent.DecRef()
+ defer dirent.DecRef(t)
return fs.NewFile(t, dirent, fs.FileFlags{Read: true, Write: true, NonSeekable: true}, &SocketOperations{
socketOpsCommon: socketOpsCommon{
Queue: queue,
@@ -416,7 +428,7 @@ func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) {
return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
}
- // TODO(b/129292371): Return protocol too.
+ // TODO(gvisor.dev/issue/173): Return protocol too.
return tcpip.FullAddress{
NIC: tcpip.NICID(a.InterfaceIndex),
Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]),
@@ -444,8 +456,21 @@ func (s *socketOpsCommon) fetchReadView() *syserr.Error {
}
s.readView = nil
s.sender = tcpip.FullAddress{}
+ s.linkPacketInfo = tcpip.LinkPacketInfo{}
+
+ var v buffer.View
+ var cms tcpip.ControlMessages
+ var err *tcpip.Error
- v, cms, err := s.Endpoint.Read(&s.sender)
+ switch e := s.Endpoint.(type) {
+ // The ordering of these interfaces matters. The most specific
+ // interfaces must be specified before the more generic Endpoint
+ // interface.
+ case tcpip.PacketEndpoint:
+ v, cms, err = e.ReadPacket(&s.sender, &s.linkPacketInfo)
+ case tcpip.Endpoint:
+ v, cms, err = e.Read(&s.sender)
+ }
if err != nil {
atomic.StoreUint32(&s.readViewHasData, 0)
return syserr.TranslateNetstackError(err)
@@ -459,8 +484,35 @@ func (s *socketOpsCommon) fetchReadView() *syserr.Error {
}
// Release implements fs.FileOperations.Release.
-func (s *socketOpsCommon) Release() {
+func (s *socketOpsCommon) Release(ctx context.Context) {
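+	// Register for hang-up and error events so that, if SO_LINGER is
+	// enabled below, we can block until the endpoint finishes closing
+	// or the linger timeout expires.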
+ e, ch := waiter.NewChannelEntry(nil)
+ s.EventRegister(&e, waiter.EventHUp|waiter.EventErr)
+ defer s.EventUnregister(&e)
+
s.Endpoint.Close()
+
+	// The SO_LINGER option is valid only for TCP. For other socket
+	// types, return after closing the endpoint.
+ if family, skType, _ := s.Type(); skType != linux.SOCK_STREAM || (family != linux.AF_INET && family != linux.AF_INET6) {
+ return
+ }
+
+ var v tcpip.LingerOption
+ if err := s.Endpoint.GetSockOpt(&v); err != nil {
+ return
+ }
+
+	// The zero-timeout case is handled in the TCP endpoint's close
+	// function. Close blocks until either:
+	// 1. the endpoint state has left FIN-WAIT1, CLOSING, and LAST_ACK, or
+	// 2. the timeout is reached.
+ if v.Enabled && v.Timeout != 0 {
+ t := kernel.TaskFromContext(ctx)
+ start := t.Kernel().MonotonicClock().Now()
+ deadline := start.Add(v.Timeout)
+ t.BlockWithDeadline(ch, true, deadline)
+ }
}
// Read implements fs.FileOperations.Read.
@@ -535,6 +587,11 @@ func (i *ioSequencePayload) Payload(size int) ([]byte, *tcpip.Error) {
}
v := buffer.NewView(size)
if _, err := i.src.CopyIn(i.ctx, v); err != nil {
+		// EOF can be returned only if src is a file; this means we are
+		// in a splice syscall, so the error has to be ignored.
+ if err == io.EOF {
+ return v, nil
+ }
return nil, tcpip.ErrBadAddress
}
return v, nil
@@ -719,6 +776,14 @@ func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool
defer s.EventUnregister(&e)
if err := s.Endpoint.Connect(addr); err != tcpip.ErrConnectStarted && err != tcpip.ErrAlreadyConnecting {
+ if (s.family == unix.AF_INET || s.family == unix.AF_INET6) && s.skType == linux.SOCK_STREAM {
+			// TCP, unlike UDP, returns EADDRNOTAVAIL when it can't
+			// find an available local ephemeral port.
+ if err == tcpip.ErrNoPortAvailable {
+ return syserr.ErrAddressNotAvailable
+ }
+ }
+
return syserr.TranslateNetstackError(err)
}
@@ -775,7 +840,20 @@ func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
}
// Issue the bind request to the endpoint.
- return syserr.TranslateNetstackError(s.Endpoint.Bind(addr))
+ err := s.Endpoint.Bind(addr)
+ if err == tcpip.ErrNoPortAvailable {
+		// Bind always returns EADDRINUSE irrespective of whether the
+		// specified port was already bound or an ephemeral port was
+		// requested but none were available.
+		//
+		// tcpip.ErrNoPortAvailable is mapped to EAGAIN in the syserr
+		// package because UDP connect returns EAGAIN on ephemeral port
+		// exhaustion.
+		//
+		// TCP connect returns EADDRNOTAVAIL on ephemeral port exhaustion.
+ err = tcpip.ErrPortInUse
+ }
+
+ return syserr.TranslateNetstackError(err)
}
// Listen implements the linux syscall listen(2) for sockets backed by
@@ -786,7 +864,7 @@ func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error {
// blockingAccept implements a blocking version of accept(2), that is, if no
// connections are ready to be accept, it will block until one becomes ready.
-func (s *socketOpsCommon) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) {
+func (s *socketOpsCommon) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) {
// Register for notifications.
e, ch := waiter.NewChannelEntry(nil)
s.EventRegister(&e, waiter.EventIn)
@@ -795,7 +873,7 @@ func (s *socketOpsCommon) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waite
// Try to accept the connection again; if it fails, then wait until we
// get a notification.
for {
- if ep, wq, err := s.Endpoint.Accept(); err != tcpip.ErrWouldBlock {
+ if ep, wq, err := s.Endpoint.Accept(peerAddr); err != tcpip.ErrWouldBlock {
return ep, wq, syserr.TranslateNetstackError(err)
}
@@ -808,15 +886,18 @@ func (s *socketOpsCommon) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waite
// Accept implements the linux syscall accept(2) for sockets backed by
// tcpip.Endpoint.
func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
- // Issue the accept request to get the new endpoint.
- ep, wq, terr := s.Endpoint.Accept()
+ var peerAddr *tcpip.FullAddress
+ if peerRequested {
+ peerAddr = &tcpip.FullAddress{}
+ }
+ ep, wq, terr := s.Endpoint.Accept(peerAddr)
if terr != nil {
if terr != tcpip.ErrWouldBlock || !blocking {
return 0, nil, 0, syserr.TranslateNetstackError(terr)
}
var err *syserr.Error
- ep, wq, err = s.blockingAccept(t)
+ ep, wq, err = s.blockingAccept(t, peerAddr)
if err != nil {
return 0, nil, 0, err
}
@@ -826,7 +907,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
if err != nil {
return 0, nil, 0, err
}
- defer ns.DecRef()
+ defer ns.DecRef(t)
if flags&linux.SOCK_NONBLOCK != 0 {
flags := ns.Flags()
@@ -836,13 +917,8 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
var addr linux.SockAddr
var addrLen uint32
- if peerRequested {
- // Get address of the peer and write it to peer slice.
- var err *syserr.Error
- addr, addrLen, err = ns.FileOperations.(*SocketOperations).GetPeerName(t)
- if err != nil {
- return 0, nil, 0, err
- }
+ if peerAddr != nil {
+ addr, addrLen = ConvertAddress(s.family, *peerAddr)
}
fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{
@@ -884,7 +960,7 @@ func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error {
// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
// tcpip.Endpoint.
-func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
+func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
// TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is
// implemented specifically for netstack.SocketOperations rather than
// commonEndpoint. commonEndpoint should be extended to support socket
@@ -894,68 +970,33 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
- val := int32(0)
+ val := primitive.Int32(0)
s.readMu.Lock()
defer s.readMu.Unlock()
if s.sockOptTimestamp {
val = 1
}
- return val, nil
+ return &val, nil
}
if level == linux.SOL_TCP && name == linux.TCP_INQ {
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
- val := int32(0)
+ val := primitive.Int32(0)
s.readMu.Lock()
defer s.readMu.Unlock()
if s.sockOptInq {
val = 1
}
- return val, nil
+ return &val, nil
}
- if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP {
- switch name {
- case linux.IPT_SO_GET_INFO:
- if outLen < linux.SizeOfIPTGetinfo {
- return nil, syserr.ErrInvalidArgument
- }
-
- stack := inet.StackFromContext(t)
- if stack == nil {
- return nil, syserr.ErrNoDevice
- }
- info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr)
- if err != nil {
- return nil, err
- }
- return info, nil
-
- case linux.IPT_SO_GET_ENTRIES:
- if outLen < linux.SizeOfIPTGetEntries {
- return nil, syserr.ErrInvalidArgument
- }
-
- stack := inet.StackFromContext(t)
- if stack == nil {
- return nil, syserr.ErrNoDevice
- }
- entries, err := netfilter.GetEntries(t, stack.(*Stack).Stack, outPtr, outLen)
- if err != nil {
- return nil, err
- }
- return entries, nil
-
- }
- }
-
- return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outLen)
+ return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outPtr, outLen)
}
// GetSockOpt can be used to implement the linux syscall getsockopt(2) for
// sockets backed by a commonEndpoint.
-func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (interface{}, *syserr.Error) {
+func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
switch level {
case linux.SOL_SOCKET:
return getSockOptSocket(t, s, ep, family, skType, name, outLen)
@@ -964,10 +1005,10 @@ func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family in
return getSockOptTCP(t, ep, name, outLen)
case linux.SOL_IPV6:
- return getSockOptIPv6(t, ep, name, outLen)
+ return getSockOptIPv6(t, s, ep, name, outPtr, outLen)
case linux.SOL_IP:
- return getSockOptIP(t, ep, name, outLen, family)
+ return getSockOptIP(t, s, ep, name, outPtr, outLen, family)
case linux.SOL_UDP,
linux.SOL_ICMPV6,
@@ -988,7 +1029,7 @@ func boolToInt32(v bool) int32 {
}
// getSockOptSocket implements GetSockOpt when level is SOL_SOCKET.
-func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (interface{}, *syserr.Error) {
+func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (marshal.Marshallable, *syserr.Error) {
// TODO(b/124056281): Stop rejecting short optLen values in getsockopt.
switch name {
case linux.SO_ERROR:
@@ -997,11 +1038,14 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
}
// Get the last error and convert it.
- err := ep.GetSockOpt(tcpip.ErrorOption{})
+ err := ep.LastError()
if err == nil {
- return int32(0), nil
+ optP := primitive.Int32(0)
+ return &optP, nil
}
- return int32(syserr.TranslateNetstackError(err).ToLinux().Number()), nil
+
+ optP := primitive.Int32(syserr.TranslateNetstackError(err).ToLinux().Number())
+ return &optP, nil
case linux.SO_PEERCRED:
if family != linux.AF_UNIX || outLen < syscall.SizeofUcred {
@@ -1009,11 +1053,12 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
}
tcred := t.Credentials()
- return syscall.Ucred{
- Pid: int32(t.ThreadGroup().ID()),
- Uid: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()),
- Gid: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()),
- }, nil
+ creds := linux.ControlMessageCredentials{
+ PID: int32(t.ThreadGroup().ID()),
+ UID: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()),
+ GID: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()),
+ }
+ return &creds, nil
case linux.SO_PASSCRED:
if outLen < sizeOfInt32 {
@@ -1024,7 +1069,9 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(v), nil
+
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
case linux.SO_SNDBUF:
if outLen < sizeOfInt32 {
@@ -1040,7 +1087,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
size = math.MaxInt32
}
- return int32(size), nil
+ sizeP := primitive.Int32(size)
+ return &sizeP, nil
case linux.SO_RCVBUF:
if outLen < sizeOfInt32 {
@@ -1056,7 +1104,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
size = math.MaxInt32
}
- return int32(size), nil
+ sizeP := primitive.Int32(size)
+ return &sizeP, nil
case linux.SO_REUSEADDR:
if outLen < sizeOfInt32 {
@@ -1067,7 +1116,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(v), nil
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
case linux.SO_REUSEPORT:
if outLen < sizeOfInt32 {
@@ -1078,7 +1128,9 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(v), nil
+
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
case linux.SO_BINDTODEVICE:
var v tcpip.BindToDeviceOption
@@ -1086,7 +1138,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
return nil, syserr.TranslateNetstackError(err)
}
if v == 0 {
- return []byte{}, nil
+ var b primitive.ByteSlice
+ return &b, nil
}
if outLen < linux.IFNAMSIZ {
return nil, syserr.ErrInvalidArgument
@@ -1101,7 +1154,9 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
// interface was removed.
return nil, syserr.ErrUnknownDevice
}
- return append([]byte(nic.Name), 0), nil
+
+ name := primitive.ByteSlice(append([]byte(nic.Name), 0))
+ return &name, nil
case linux.SO_BROADCAST:
if outLen < sizeOfInt32 {
@@ -1112,7 +1167,9 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(v), nil
+
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
case linux.SO_KEEPALIVE:
if outLen < sizeOfInt32 {
@@ -1123,13 +1180,26 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(v), nil
+
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
case linux.SO_LINGER:
if outLen < linux.SizeOfLinger {
return nil, syserr.ErrInvalidArgument
}
- return linux.Linger{}, nil
+
+ var v tcpip.LingerOption
+ var linger linux.Linger
+ if err := ep.GetSockOpt(&v); err != nil {
+ return nil, syserr.TranslateNetstackError(err)
+ }
+
+ if v.Enabled {
+ linger.OnOff = 1
+ }
+ linger.Linger = int32(v.Timeout.Seconds())
+ return &linger, nil
case linux.SO_SNDTIMEO:
// TODO(igudger): Linux allows shorter lengths for partial results.
@@ -1137,7 +1207,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
return nil, syserr.ErrInvalidArgument
}
- return linux.NsecToTimeval(s.SendTimeout()), nil
+ sendTimeout := linux.NsecToTimeval(s.SendTimeout())
+ return &sendTimeout, nil
case linux.SO_RCVTIMEO:
// TODO(igudger): Linux allows shorter lengths for partial results.
@@ -1145,7 +1216,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
return nil, syserr.ErrInvalidArgument
}
- return linux.NsecToTimeval(s.RecvTimeout()), nil
+ recvTimeout := linux.NsecToTimeval(s.RecvTimeout())
+ return &recvTimeout, nil
case linux.SO_OOBINLINE:
if outLen < sizeOfInt32 {
@@ -1157,7 +1229,32 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
return nil, syserr.TranslateNetstackError(err)
}
- return int32(v), nil
+ vP := primitive.Int32(v)
+ return &vP, nil
+
+ case linux.SO_NO_CHECK:
+ if outLen < sizeOfInt32 {
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ v, err := ep.GetSockOptBool(tcpip.NoChecksumOption)
+ if err != nil {
+ return nil, syserr.TranslateNetstackError(err)
+ }
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
+
+ case linux.SO_ACCEPTCONN:
+ if outLen < sizeOfInt32 {
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ v, err := ep.GetSockOptBool(tcpip.AcceptConnOption)
+ if err != nil {
+ return nil, syserr.TranslateNetstackError(err)
+ }
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
default:
socket.GetSockOptEmitUnimplementedEvent(t, name)
@@ -1166,7 +1263,7 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
}
// getSockOptTCP implements GetSockOpt when level is SOL_TCP.
-func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interface{}, *syserr.Error) {
+func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) {
switch name {
case linux.TCP_NODELAY:
if outLen < sizeOfInt32 {
@@ -1177,7 +1274,9 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(!v), nil
+
+ vP := primitive.Int32(boolToInt32(!v))
+ return &vP, nil
case linux.TCP_CORK:
if outLen < sizeOfInt32 {
@@ -1188,7 +1287,9 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(v), nil
+
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
case linux.TCP_QUICKACK:
if outLen < sizeOfInt32 {
@@ -1199,7 +1300,9 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(v), nil
+
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
case linux.TCP_MAXSEG:
if outLen < sizeOfInt32 {
@@ -1210,8 +1313,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
-
- return int32(v), nil
+ vP := primitive.Int32(v)
+ return &vP, nil
case linux.TCP_KEEPIDLE:
if outLen < sizeOfInt32 {
@@ -1222,8 +1325,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
-
- return int32(time.Duration(v) / time.Second), nil
+ keepAliveIdle := primitive.Int32(time.Duration(v) / time.Second)
+ return &keepAliveIdle, nil
case linux.TCP_KEEPINTVL:
if outLen < sizeOfInt32 {
@@ -1234,8 +1337,20 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
+ keepAliveInterval := primitive.Int32(time.Duration(v) / time.Second)
+ return &keepAliveInterval, nil
- return int32(time.Duration(v) / time.Second), nil
+ case linux.TCP_KEEPCNT:
+ if outLen < sizeOfInt32 {
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ v, err := ep.GetSockOptInt(tcpip.KeepaliveCountOption)
+ if err != nil {
+ return nil, syserr.TranslateNetstackError(err)
+ }
+ vP := primitive.Int32(v)
+ return &vP, nil
case linux.TCP_USER_TIMEOUT:
if outLen < sizeOfInt32 {
@@ -1246,8 +1361,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
-
- return int32(time.Duration(v) / time.Millisecond), nil
+ tcpUserTimeout := primitive.Int32(time.Duration(v) / time.Millisecond)
+ return &tcpUserTimeout, nil
case linux.TCP_INFO:
var v tcpip.TCPInfoOption
@@ -1260,12 +1375,13 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
info := linux.TCPInfo{}
// Linux truncates the output binary to outLen.
- ib := binary.Marshal(nil, usermem.ByteOrder, &info)
- if len(ib) > outLen {
- ib = ib[:outLen]
+ buf := t.CopyScratchBuffer(info.SizeBytes())
+ info.MarshalUnsafe(buf)
+ if len(buf) > outLen {
+ buf = buf[:outLen]
}
-
- return ib, nil
+ bufP := primitive.ByteSlice(buf)
+ return &bufP, nil
case linux.TCP_CC_INFO,
linux.TCP_NOTSENT_LOWAT,
@@ -1295,7 +1411,9 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
}
b := make([]byte, toCopy)
copy(b, v)
- return b, nil
+
+ bP := primitive.ByteSlice(b)
+ return &bP, nil
case linux.TCP_LINGER2:
if outLen < sizeOfInt32 {
@@ -1306,8 +1424,13 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
-
- return int32(time.Duration(v) / time.Second), nil
+ var lingerTimeout primitive.Int32
+ if v >= 0 {
+ lingerTimeout = primitive.Int32(time.Duration(v) / time.Second)
+ } else {
+ lingerTimeout = -1
+ }
+ return &lingerTimeout, nil
case linux.TCP_DEFER_ACCEPT:
if outLen < sizeOfInt32 {
@@ -1319,7 +1442,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
return nil, syserr.TranslateNetstackError(err)
}
- return int32(time.Duration(v) / time.Second), nil
+ tcpDeferAccept := primitive.Int32(time.Duration(v) / time.Second)
+ return &tcpDeferAccept, nil
case linux.TCP_SYNCNT:
if outLen < sizeOfInt32 {
@@ -1330,8 +1454,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
-
- return int32(v), nil
+ vP := primitive.Int32(v)
+ return &vP, nil
case linux.TCP_WINDOW_CLAMP:
if outLen < sizeOfInt32 {
@@ -1342,8 +1466,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
-
- return int32(v), nil
+ vP := primitive.Int32(v)
+ return &vP, nil
default:
emitUnimplementedEventTCP(t, name)
}
@@ -1351,7 +1475,7 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
}
// getSockOptIPv6 implements GetSockOpt when level is SOL_IPV6.
-func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interface{}, *syserr.Error) {
+func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
switch name {
case linux.IPV6_V6ONLY:
if outLen < sizeOfInt32 {
@@ -1362,7 +1486,9 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(v), nil
+
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
case linux.IPV6_PATHMTU:
t.Kernel().EmitUnimplementedEvent(t)
@@ -1370,21 +1496,24 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
case linux.IPV6_TCLASS:
// Length handling for parity with Linux.
if outLen == 0 {
- return make([]byte, 0), nil
+ var b primitive.ByteSlice
+ return &b, nil
}
v, err := ep.GetSockOptInt(tcpip.IPv6TrafficClassOption)
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- uintv := uint32(v)
+ uintv := primitive.Uint32(v)
// Linux truncates the output binary to outLen.
- ib := binary.Marshal(nil, usermem.ByteOrder, &uintv)
+ ib := t.CopyScratchBuffer(uintv.SizeBytes())
+ uintv.MarshalUnsafe(ib)
		// Handle cases where outLen is less than sizeOfInt32.
if len(ib) > outLen {
ib = ib[:outLen]
}
- return ib, nil
+ ibP := primitive.ByteSlice(ib)
+ return &ibP, nil
case linux.IPV6_RECVTCLASS:
if outLen < sizeOfInt32 {
@@ -1395,7 +1524,82 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(v), nil
+
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
+
+ case linux.IP6T_ORIGINAL_DST:
+ if outLen < int(binary.Size(linux.SockAddrInet6{})) {
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ var v tcpip.OriginalDestinationOption
+ if err := ep.GetSockOpt(&v); err != nil {
+ return nil, syserr.TranslateNetstackError(err)
+ }
+
+ a, _ := ConvertAddress(linux.AF_INET6, tcpip.FullAddress(v))
+ return a.(*linux.SockAddrInet6), nil
+
+ case linux.IP6T_SO_GET_INFO:
+ if outLen < linux.SizeOfIPTGetinfo {
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ // Only valid for raw IPv6 sockets.
+ if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW {
+ return nil, syserr.ErrProtocolNotAvailable
+ }
+
+ stack := inet.StackFromContext(t)
+ if stack == nil {
+ return nil, syserr.ErrNoDevice
+ }
+ info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr, true)
+ if err != nil {
+ return nil, err
+ }
+ return &info, nil
+
+ case linux.IP6T_SO_GET_ENTRIES:
+ // IPTGetEntries is reused for IPv6.
+ if outLen < linux.SizeOfIPTGetEntries {
+ return nil, syserr.ErrInvalidArgument
+ }
+ // Only valid for raw IPv6 sockets.
+ if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW {
+ return nil, syserr.ErrProtocolNotAvailable
+ }
+
+ stack := inet.StackFromContext(t)
+ if stack == nil {
+ return nil, syserr.ErrNoDevice
+ }
+ entries, err := netfilter.GetEntries6(t, stack.(*Stack).Stack, outPtr, outLen)
+ if err != nil {
+ return nil, err
+ }
+ return &entries, nil
+
+ case linux.IP6T_SO_GET_REVISION_TARGET:
+ if outLen < linux.SizeOfXTGetRevision {
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ // Only valid for raw IPv6 sockets.
+ if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW {
+ return nil, syserr.ErrProtocolNotAvailable
+ }
+
+ stack := inet.StackFromContext(t)
+ if stack == nil {
+ return nil, syserr.ErrNoDevice
+ }
+ ret, err := netfilter.TargetRevision(t, outPtr, header.IPv6ProtocolNumber)
+ if err != nil {
+ return nil, err
+ }
+ return &ret, nil
default:
emitUnimplementedEventIPv6(t, name)
@@ -1404,7 +1608,7 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
}
// getSockOptIP implements GetSockOpt when level is SOL_IP.
-func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family int) (interface{}, *syserr.Error) {
+func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, outPtr usermem.Addr, outLen int, family int) (marshal.Marshallable, *syserr.Error) {
switch name {
case linux.IP_TTL:
if outLen < sizeOfInt32 {
@@ -1417,11 +1621,12 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
}
// Fill in the default value, if needed.
- if v == 0 {
- v = DefaultTTL
+ vP := primitive.Int32(v)
+ if vP == 0 {
+ vP = DefaultTTL
}
- return int32(v), nil
+ return &vP, nil
case linux.IP_MULTICAST_TTL:
if outLen < sizeOfInt32 {
@@ -1433,7 +1638,8 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
return nil, syserr.TranslateNetstackError(err)
}
- return int32(v), nil
+ vP := primitive.Int32(v)
+ return &vP, nil
case linux.IP_MULTICAST_IF:
if outLen < len(linux.InetAddr{}) {
@@ -1447,7 +1653,7 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
a, _ := ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr})
- return a.(*linux.SockAddrInet).Addr, nil
+ return &a.(*linux.SockAddrInet).Addr, nil
case linux.IP_MULTICAST_LOOP:
if outLen < sizeOfInt32 {
@@ -1458,21 +1664,26 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(v), nil
+
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
case linux.IP_TOS:
// Length handling for parity with Linux.
if outLen == 0 {
- return []byte(nil), nil
+ var b primitive.ByteSlice
+ return &b, nil
}
v, err := ep.GetSockOptInt(tcpip.IPv4TOSOption)
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
if outLen < sizeOfInt32 {
- return uint8(v), nil
+ vP := primitive.Uint8(v)
+ return &vP, nil
}
- return int32(v), nil
+ vP := primitive.Int32(v)
+ return &vP, nil
case linux.IP_RECVTOS:
if outLen < sizeOfInt32 {
@@ -1483,7 +1694,9 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(v), nil
+
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
case linux.IP_PKTINFO:
if outLen < sizeOfInt32 {
@@ -1494,7 +1707,82 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(v), nil
+
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
+
+ case linux.SO_ORIGINAL_DST:
+ if outLen < int(binary.Size(linux.SockAddrInet{})) {
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ var v tcpip.OriginalDestinationOption
+ if err := ep.GetSockOpt(&v); err != nil {
+ return nil, syserr.TranslateNetstackError(err)
+ }
+
+ a, _ := ConvertAddress(linux.AF_INET, tcpip.FullAddress(v))
+ return a.(*linux.SockAddrInet), nil
+
+ case linux.IPT_SO_GET_INFO:
+ if outLen < linux.SizeOfIPTGetinfo {
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ // Only valid for raw IPv4 sockets.
+ if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
+ return nil, syserr.ErrProtocolNotAvailable
+ }
+
+ stack := inet.StackFromContext(t)
+ if stack == nil {
+ return nil, syserr.ErrNoDevice
+ }
+ info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr, false)
+ if err != nil {
+ return nil, err
+ }
+ return &info, nil
+
+ case linux.IPT_SO_GET_ENTRIES:
+ if outLen < linux.SizeOfIPTGetEntries {
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ // Only valid for raw IPv4 sockets.
+ if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
+ return nil, syserr.ErrProtocolNotAvailable
+ }
+
+ stack := inet.StackFromContext(t)
+ if stack == nil {
+ return nil, syserr.ErrNoDevice
+ }
+ entries, err := netfilter.GetEntries4(t, stack.(*Stack).Stack, outPtr, outLen)
+ if err != nil {
+ return nil, err
+ }
+ return &entries, nil
+
+ case linux.IPT_SO_GET_REVISION_TARGET:
+ if outLen < linux.SizeOfXTGetRevision {
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ // Only valid for raw IPv4 sockets.
+ if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
+ return nil, syserr.ErrProtocolNotAvailable
+ }
+
+ stack := inet.StackFromContext(t)
+ if stack == nil {
+ return nil, syserr.ErrNoDevice
+ }
+ ret, err := netfilter.TargetRevision(t, outPtr, header.IPv4ProtocolNumber)
+ if err != nil {
+ return nil, err
+ }
+ return &ret, nil
default:
emitUnimplementedEventIP(t, name)
@@ -1529,26 +1817,6 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa
return nil
}
- if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP {
- switch name {
- case linux.IPT_SO_SET_REPLACE:
- if len(optVal) < linux.SizeOfIPTReplace {
- return syserr.ErrInvalidArgument
- }
-
- stack := inet.StackFromContext(t)
- if stack == nil {
- return syserr.ErrNoDevice
- }
- // Stack must be a netstack stack.
- return netfilter.SetEntries(stack.(*Stack).Stack, optVal)
-
- case linux.IPT_SO_SET_ADD_COUNTERS:
- // TODO(gvisor.dev/issue/170): Counter support.
- return nil
- }
- }
-
return SetSockOpt(t, s, s.Endpoint, level, name, optVal)
}
@@ -1563,21 +1831,26 @@ func SetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, level int
return setSockOptTCP(t, ep, name, optVal)
case linux.SOL_IPV6:
- return setSockOptIPv6(t, ep, name, optVal)
+ return setSockOptIPv6(t, s, ep, name, optVal)
case linux.SOL_IP:
- return setSockOptIP(t, ep, name, optVal)
+ return setSockOptIP(t, s, ep, name, optVal)
+
+ case linux.SOL_PACKET:
+		// gVisor doesn't support any SOL_PACKET options; just return not
+		// supported. Returning nil here would make tcpdump think AF_PACKET
+		// features are supported, proceed to use them, and break.
+ t.Kernel().EmitUnimplementedEvent(t)
+ return syserr.ErrProtocolNotAvailable
case linux.SOL_UDP,
linux.SOL_ICMPV6,
- linux.SOL_RAW,
- linux.SOL_PACKET:
+ linux.SOL_RAW:
t.Kernel().EmitUnimplementedEvent(t)
}
- // Default to the old behavior; hand off to network stack.
- return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{}))
+ return nil
}
// setSockOptSocket implements SetSockOpt when level is SOL_SOCKET.
@@ -1622,7 +1895,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
}
name := string(optVal[:n])
if name == "" {
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BindToDeviceOption(0)))
+ v := tcpip.BindToDeviceOption(0)
+ return syserr.TranslateNetstackError(ep.SetSockOpt(&v))
}
s := t.NetworkContext()
if s == nil {
@@ -1630,7 +1904,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
}
for nicID, nic := range s.Interfaces() {
if nic.Name == name {
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BindToDeviceOption(nicID)))
+ v := tcpip.BindToDeviceOption(nicID)
+ return syserr.TranslateNetstackError(ep.SetSockOpt(&v))
}
}
return syserr.ErrUnknownDevice
@@ -1696,7 +1971,16 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
socket.SetSockOptEmitUnimplementedEvent(t, name)
}
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.OutOfBandInlineOption(v)))
+ opt := tcpip.OutOfBandInlineOption(v)
+ return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
+
+ case linux.SO_NO_CHECK:
+ if len(optVal) < sizeOfInt32 {
+ return syserr.ErrInvalidArgument
+ }
+
+ v := usermem.ByteOrder.Uint32(optVal)
+ return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.NoChecksumOption, v != 0))
case linux.SO_LINGER:
if len(optVal) < linux.SizeOfLinger {
@@ -1710,14 +1994,21 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
socket.SetSockOptEmitUnimplementedEvent(t, name)
}
- return nil
+ return syserr.TranslateNetstackError(
+ ep.SetSockOpt(&tcpip.LingerOption{
+ Enabled: v.OnOff != 0,
+ Timeout: time.Second * time.Duration(v.Linger)}))
+
+ case linux.SO_DETACH_FILTER:
+ // optval is ignored.
+ var v tcpip.SocketDetachFilterOption
+ return syserr.TranslateNetstackError(ep.SetSockOpt(&v))
default:
socket.SetSockOptEmitUnimplementedEvent(t, name)
}
- // Default to the old behavior; hand off to network stack.
- return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{}))
+ return nil
}
// setSockOptTCP implements SetSockOpt when level is SOL_TCP.
@@ -1764,7 +2055,8 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
if v < 1 || v > linux.MAX_TCP_KEEPIDLE {
return syserr.ErrInvalidArgument
}
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIdleOption(time.Second * time.Duration(v))))
+ opt := tcpip.KeepaliveIdleOption(time.Second * time.Duration(v))
+ return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
case linux.TCP_KEEPINTVL:
if len(optVal) < sizeOfInt32 {
@@ -1775,7 +2067,19 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
if v < 1 || v > linux.MAX_TCP_KEEPINTVL {
return syserr.ErrInvalidArgument
}
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v))))
+ opt := tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v))
+ return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
+
+ case linux.TCP_KEEPCNT:
+ if len(optVal) < sizeOfInt32 {
+ return syserr.ErrInvalidArgument
+ }
+
+ v := usermem.ByteOrder.Uint32(optVal)
+ if v < 1 || v > linux.MAX_TCP_KEEPCNT {
+ return syserr.ErrInvalidArgument
+ }
+ return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.KeepaliveCountOption, int(v)))
case linux.TCP_USER_TIMEOUT:
if len(optVal) < sizeOfInt32 {
@@ -1786,11 +2090,12 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
if v < 0 {
return syserr.ErrInvalidArgument
}
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v))))
+ opt := tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v))
+ return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
case linux.TCP_CONGESTION:
v := tcpip.CongestionControlOption(optVal)
- if err := ep.SetSockOpt(v); err != nil {
+ if err := ep.SetSockOpt(&v); err != nil {
return syserr.TranslateNetstackError(err)
}
return nil
@@ -1800,8 +2105,9 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
return syserr.ErrInvalidArgument
}
- v := usermem.ByteOrder.Uint32(optVal)
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v))))
+ v := int32(usermem.ByteOrder.Uint32(optVal))
+ opt := tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v))
+ return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
case linux.TCP_DEFER_ACCEPT:
if len(optVal) < sizeOfInt32 {
@@ -1811,7 +2117,8 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
if v < 0 {
v = 0
}
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v))))
+ opt := tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v))
+ return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
case linux.TCP_SYNCNT:
if len(optVal) < sizeOfInt32 {
@@ -1836,12 +2143,11 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
emitUnimplementedEventTCP(t, name)
}
- // Default to the old behavior; hand off to network stack.
- return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{}))
+ return nil
}
// setSockOptIPv6 implements SetSockOpt when level is SOL_IPV6.
-func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
+func setSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
switch name {
case linux.IPV6_V6ONLY:
if len(optVal) < sizeOfInt32 {
@@ -1890,12 +2196,32 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte)
return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTClassOption, v != 0))
+ case linux.IP6T_SO_SET_REPLACE:
+ if len(optVal) < linux.SizeOfIP6TReplace {
+ return syserr.ErrInvalidArgument
+ }
+
+ // Only valid for raw IPv6 sockets.
+ if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW {
+ return syserr.ErrProtocolNotAvailable
+ }
+
+ stack := inet.StackFromContext(t)
+ if stack == nil {
+ return syserr.ErrNoDevice
+ }
+ // Stack must be a netstack stack.
+ return netfilter.SetEntries(stack.(*Stack).Stack, optVal, true)
+
+ case linux.IP6T_SO_SET_ADD_COUNTERS:
+ // TODO(gvisor.dev/issue/170): Counter support.
+ return nil
+
default:
emitUnimplementedEventIPv6(t, name)
}
- // Default to the old behavior; hand off to network stack.
- return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{}))
+ return nil
}
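The IP6T table-replace case above and the matching IPT case later in this diff reduce to the same guard-and-dispatch shape, now shared rather than living in the VFS2 socket (whose copy is removed further down). A hypothetical consolidation, with netfilter.SetEntries' signature assumed from those calls:

	func setTable(t *kernel.Task, s socket.SocketOps, optVal []byte, ipv6 bool) *syserr.Error {
		// Only raw sockets of the matching family may replace tables.
		want := linux.AF_INET
		if ipv6 {
			want = linux.AF_INET6
		}
		if family, skType, _ := s.Type(); family != want || skType != linux.SOCK_RAW {
			return syserr.ErrProtocolNotAvailable
		}
		stk := inet.StackFromContext(t)
		if stk == nil {
			return syserr.ErrNoDevice
		}
		// Stack must be a netstack stack.
		return netfilter.SetEntries(stk.(*Stack).Stack, optVal, ipv6)
	}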
var (
@@ -1950,7 +2276,7 @@ func parseIntOrChar(buf []byte) (int32, *syserr.Error) {
}
// setSockOptIP implements SetSockOpt when level is SOL_IP.
-func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
+func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
switch name {
case linux.IP_MULTICAST_TTL:
v, err := parseIntOrChar(optVal)
@@ -1973,7 +2299,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
return err
}
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.AddMembershipOption{
+ return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{
NIC: tcpip.NICID(req.InterfaceIndex),
// TODO(igudger): Change AddMembership to use the standard
// any address representation.
@@ -1987,7 +2313,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
return err
}
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.RemoveMembershipOption{
+ return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{
NIC: tcpip.NICID(req.InterfaceIndex),
// TODO(igudger): Change DropMembership to use the standard
// any address representation.
@@ -2001,7 +2327,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
return err
}
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.MulticastInterfaceOption{
+ return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.MulticastInterfaceOption{
NIC: tcpip.NICID(req.InterfaceIndex),
InterfaceAddr: bytesToIPAddress(req.InterfaceAddr[:]),
}))
@@ -2060,13 +2386,43 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
}
return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveIPPacketInfoOption, v != 0))
+ case linux.IP_HDRINCL:
+ if len(optVal) == 0 {
+ return nil
+ }
+ v, err := parseIntOrChar(optVal)
+ if err != nil {
+ return err
+ }
+ return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.IPHdrIncludedOption, v != 0))
+
+ case linux.IPT_SO_SET_REPLACE:
+ if len(optVal) < linux.SizeOfIPTReplace {
+ return syserr.ErrInvalidArgument
+ }
+
+ // Only valid for raw IPv4 sockets.
+ if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
+ return syserr.ErrProtocolNotAvailable
+ }
+
+ stack := inet.StackFromContext(t)
+ if stack == nil {
+ return syserr.ErrNoDevice
+ }
+ // Stack must be a netstack stack.
+ return netfilter.SetEntries(stack.(*Stack).Stack, optVal, false)
+
+ case linux.IPT_SO_SET_ADD_COUNTERS:
+ // TODO(gvisor.dev/issue/170): Counter support.
+ return nil
+
case linux.IP_ADD_SOURCE_MEMBERSHIP,
linux.IP_BIND_ADDRESS_NO_PORT,
linux.IP_BLOCK_SOURCE,
linux.IP_CHECKSUM,
linux.IP_DROP_SOURCE_MEMBERSHIP,
linux.IP_FREEBIND,
- linux.IP_HDRINCL,
linux.IP_IPSEC_POLICY,
linux.IP_MINTTL,
linux.IP_MSFILTER,
@@ -2095,8 +2451,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
t.Kernel().EmitUnimplementedEvent(t)
}
- // Default to the old behavior; hand off to network stack.
- return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{}))
+ return nil
}
// emitUnimplementedEventTCP emits unimplemented event if name is valid. This
@@ -2106,30 +2461,20 @@ func emitUnimplementedEventTCP(t *kernel.Task, name int) {
switch name {
case linux.TCP_CONGESTION,
linux.TCP_CORK,
- linux.TCP_DEFER_ACCEPT,
linux.TCP_FASTOPEN,
linux.TCP_FASTOPEN_CONNECT,
linux.TCP_FASTOPEN_KEY,
linux.TCP_FASTOPEN_NO_COOKIE,
- linux.TCP_KEEPCNT,
- linux.TCP_KEEPIDLE,
- linux.TCP_KEEPINTVL,
- linux.TCP_LINGER2,
- linux.TCP_MAXSEG,
linux.TCP_QUEUE_SEQ,
- linux.TCP_QUICKACK,
linux.TCP_REPAIR,
linux.TCP_REPAIR_QUEUE,
linux.TCP_REPAIR_WINDOW,
linux.TCP_SAVED_SYN,
linux.TCP_SAVE_SYN,
- linux.TCP_SYNCNT,
linux.TCP_THIN_DUPACK,
linux.TCP_THIN_LINEAR_TIMEOUTS,
linux.TCP_TIMESTAMP,
- linux.TCP_ULP,
- linux.TCP_USER_TIMEOUT,
- linux.TCP_WINDOW_CLAMP:
+ linux.TCP_ULP:
t.Kernel().EmitUnimplementedEvent(t)
}
@@ -2291,7 +2636,7 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32)
return &out, uint32(sockAddrInet6Size)
case linux.AF_PACKET:
- // TODO(b/129292371): Return protocol too.
+ // TODO(gvisor.dev/issue/173): Return protocol too.
var out linux.SockAddrLink
out.Family = linux.AF_PACKET
out.InterfaceIndex = int32(addr.NIC)
@@ -2397,6 +2742,23 @@ func (s *socketOpsCommon) fillCmsgInq(cmsg *socket.ControlMessages) {
cmsg.IP.Inq = int32(len(s.readView) + rcvBufUsed)
}
+func toLinuxPacketType(pktType tcpip.PacketType) uint8 {
+ switch pktType {
+ case tcpip.PacketHost:
+ return linux.PACKET_HOST
+ case tcpip.PacketOtherHost:
+ return linux.PACKET_OTHERHOST
+ case tcpip.PacketOutgoing:
+ return linux.PACKET_OUTGOING
+ case tcpip.PacketBroadcast:
+ return linux.PACKET_BROADCAST
+ case tcpip.PacketMulticast:
+ return linux.PACKET_MULTICAST
+ default:
+ panic(fmt.Sprintf("unknown packet type: %d", pktType))
+ }
+}
+
// nonBlockingRead issues a non-blocking read.
//
// TODO(b/78348848): Support timestamps for stream sockets.
@@ -2452,6 +2814,11 @@ func (s *socketOpsCommon) nonBlockingRead(ctx context.Context, dst usermem.IOSeq
var addrLen uint32
if isPacket && senderRequested {
addr, addrLen = ConvertAddress(s.family, s.sender)
+ switch v := addr.(type) {
+ case *linux.SockAddrLink:
+ v.Protocol = htons(uint16(s.linkPacketInfo.Protocol))
+ v.PacketType = toLinuxPacketType(s.linkPacketInfo.PktType)
+ }
}
if peek {
@@ -2686,11 +3053,16 @@ func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO,
}
func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ panic("ioctl(2) may only be called from a task goroutine")
+ }
+
// SIOCGSTAMP is implemented by netstack rather than all commonEndpoint
// sockets.
// TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP.
switch args[1].Int() {
- case syscall.SIOCGSTAMP:
+ case linux.SIOCGSTAMP:
s.readMu.Lock()
defer s.readMu.Unlock()
if !s.timestampValid {
@@ -2698,9 +3070,7 @@ func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.Sy
}
tv := linux.NsecToTimeval(s.timestampNS)
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &tv, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := tv.CopyOut(t, args[2].Pointer())
return 0, err
case linux.TIOCINQ:
@@ -2719,9 +3089,8 @@ func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.Sy
}
// Copy result to userspace.
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ vP := primitive.Int32(v)
+ _, err := vP.CopyOut(t, args[2].Pointer())
return 0, err
}
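The copy-out changes in these ioctl paths swap usermem.CopyObjectOut for the go_marshal API: marshallable values copy themselves against a marshal context, here the task t (which is assumed to imply an active address space). The idiom, as used above:

	vP := primitive.Int32(v)
	if _, err := vP.CopyOut(t, args[2].Pointer()); err != nil {
		return 0, err
	}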
@@ -2730,52 +3099,49 @@ func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.Sy
// Ioctl performs a socket ioctl.
func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ panic("ioctl(2) may only be called from a task goroutine")
+ }
+
switch arg := int(args[1].Int()); arg {
- case syscall.SIOCGIFFLAGS,
- syscall.SIOCGIFADDR,
- syscall.SIOCGIFBRDADDR,
- syscall.SIOCGIFDSTADDR,
- syscall.SIOCGIFHWADDR,
- syscall.SIOCGIFINDEX,
- syscall.SIOCGIFMAP,
- syscall.SIOCGIFMETRIC,
- syscall.SIOCGIFMTU,
- syscall.SIOCGIFNAME,
- syscall.SIOCGIFNETMASK,
- syscall.SIOCGIFTXQLEN:
+ case linux.SIOCGIFFLAGS,
+ linux.SIOCGIFADDR,
+ linux.SIOCGIFBRDADDR,
+ linux.SIOCGIFDSTADDR,
+ linux.SIOCGIFHWADDR,
+ linux.SIOCGIFINDEX,
+ linux.SIOCGIFMAP,
+ linux.SIOCGIFMETRIC,
+ linux.SIOCGIFMTU,
+ linux.SIOCGIFNAME,
+ linux.SIOCGIFNETMASK,
+ linux.SIOCGIFTXQLEN,
+ linux.SIOCETHTOOL:
var ifr linux.IFReq
- if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &ifr, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ if _, err := ifr.CopyIn(t, args[2].Pointer()); err != nil {
return 0, err
}
if err := interfaceIoctl(ctx, io, arg, &ifr); err != nil {
return 0, err.ToError()
}
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &ifr, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := ifr.CopyOut(t, args[2].Pointer())
return 0, err
- case syscall.SIOCGIFCONF:
+ case linux.SIOCGIFCONF:
// Return a list of interface addresses or the buffer size
// necessary to hold the list.
var ifc linux.IFConf
- if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &ifc, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ if _, err := ifc.CopyIn(t, args[2].Pointer()); err != nil {
return 0, err
}
- if err := ifconfIoctl(ctx, io, &ifc); err != nil {
+ if err := ifconfIoctl(ctx, t, io, &ifc); err != nil {
return 0, err
}
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), ifc, usermem.IOOpts{
- AddressSpaceActive: true,
- })
-
+ _, err := ifc.CopyOut(t, args[2].Pointer())
return 0, err
case linux.TIOCINQ:
@@ -2788,9 +3154,8 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc
v = math.MaxInt32
}
// Copy result to userspace.
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ vP := primitive.Int32(v)
+ _, err := vP.CopyOut(t, args[2].Pointer())
return 0, err
case linux.TIOCOUTQ:
@@ -2804,9 +3169,8 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc
}
// Copy result to userspace.
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ vP := primitive.Int32(v)
+ _, err := vP.CopyOut(t, args[2].Pointer())
return 0, err
case linux.SIOCGIFMEM, linux.SIOCGIFPFLAGS, linux.SIOCGMIIPHY, linux.SIOCGMIIREG:
@@ -2832,7 +3196,7 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe
// SIOCGIFNAME uses ifr.ifr_ifindex rather than ifr.ifr_name to
// identify a device.
- if arg == syscall.SIOCGIFNAME {
+ if arg == linux.SIOCGIFNAME {
// Gets the name of the interface given the interface index
// stored in ifr_ifindex.
index = int32(usermem.ByteOrder.Uint32(ifr.Data[:4]))
@@ -2855,21 +3219,28 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe
}
switch arg {
- case syscall.SIOCGIFINDEX:
+ case linux.SIOCGIFINDEX:
// Copy out the index to the data.
usermem.ByteOrder.PutUint32(ifr.Data[:], uint32(index))
- case syscall.SIOCGIFHWADDR:
+ case linux.SIOCGIFHWADDR:
// Copy the hardware address out.
- ifr.Data[0] = 6 // IEEE802.2 arp type.
- ifr.Data[1] = 0
+ //
+ // See: https://linux.die.net/man/7/netdevice
+ // SIOCGIFHWADDR, SIOCSIFHWADDR
+ //
+ // Get or set the hardware address of a device using
+ // ifr_hwaddr. The hardware address is specified in a struct
+ // sockaddr. sa_family contains the ARPHRD_* device type,
+ // sa_data the L2 hardware address starting from byte 0. Setting
+ // the hardware address is a privileged operation.
+ usermem.ByteOrder.PutUint16(ifr.Data[:], iface.DeviceType)
n := copy(ifr.Data[2:], iface.Addr)
for i := 2 + n; i < len(ifr.Data); i++ {
ifr.Data[i] = 0 // Clear padding.
}
- usermem.ByteOrder.PutUint16(ifr.Data[:2], uint16(n))
- case syscall.SIOCGIFFLAGS:
+ case linux.SIOCGIFFLAGS:
f, err := interfaceStatusFlags(stack, iface.Name)
if err != nil {
return err
@@ -2878,7 +3249,7 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe
// matches Linux behavior.
usermem.ByteOrder.PutUint16(ifr.Data[:2], uint16(f))
- case syscall.SIOCGIFADDR:
+ case linux.SIOCGIFADDR:
// Copy the IPv4 address out.
for _, addr := range stack.InterfaceAddrs()[index] {
// This ioctl is only compatible with AF_INET addresses.
@@ -2889,32 +3260,32 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe
break
}
- case syscall.SIOCGIFMETRIC:
+ case linux.SIOCGIFMETRIC:
// Gets the metric of the device. As per netdevice(7), this
// always just sets ifr_metric to 0.
usermem.ByteOrder.PutUint32(ifr.Data[:4], 0)
- case syscall.SIOCGIFMTU:
+ case linux.SIOCGIFMTU:
// Gets the MTU of the device.
usermem.ByteOrder.PutUint32(ifr.Data[:4], iface.MTU)
- case syscall.SIOCGIFMAP:
+ case linux.SIOCGIFMAP:
// Gets the hardware parameters of the device.
// TODO(gvisor.dev/issue/505): Implement.
- case syscall.SIOCGIFTXQLEN:
+ case linux.SIOCGIFTXQLEN:
// Gets the transmit queue length of the device.
// TODO(gvisor.dev/issue/505): Implement.
- case syscall.SIOCGIFDSTADDR:
+ case linux.SIOCGIFDSTADDR:
// Gets the destination address of a point-to-point device.
// TODO(gvisor.dev/issue/505): Implement.
- case syscall.SIOCGIFBRDADDR:
+ case linux.SIOCGIFBRDADDR:
// Gets the broadcast address of a device.
// TODO(gvisor.dev/issue/505): Implement.
- case syscall.SIOCGIFNETMASK:
+ case linux.SIOCGIFNETMASK:
// Gets the network mask of a device.
for _, addr := range stack.InterfaceAddrs()[index] {
// This ioctl is only compatible with AF_INET addresses.
@@ -2931,6 +3302,14 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe
break
}
+ case linux.SIOCETHTOOL:
+ // Stubbed out for now; ideally we should implement the required
+ // ETHTOOL sub-commands.
+ //
+ // See:
+ // https://github.com/torvalds/linux/blob/aa0c9086b40c17a7ad94425b3b70dd1fdd7497bf/net/core/dev_ioctl.c
+ return syserr.ErrEndpointOperation
+
default:
// Not a valid call.
return syserr.ErrInvalidArgument
@@ -2940,7 +3319,7 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe
}
// ifconfIoctl populates a struct ifconf for the SIOCGIFCONF ioctl.
-func ifconfIoctl(ctx context.Context, io usermem.IO, ifc *linux.IFConf) error {
+func ifconfIoctl(ctx context.Context, t *kernel.Task, io usermem.IO, ifc *linux.IFConf) error {
// If Ptr is NULL, return the necessary buffer size via Len.
// Otherwise, write up to Len bytes starting at Ptr containing ifreq
// structs.
@@ -2977,9 +3356,7 @@ func ifconfIoctl(ctx context.Context, io usermem.IO, ifc *linux.IFConf) error {
// Copy the ifr to userspace.
dst := uintptr(ifc.Ptr) + uintptr(ifc.Len)
ifc.Len += int32(linux.SizeOfIFReq)
- if _, err := usermem.CopyObjectOut(ctx, io, usermem.Addr(dst), ifr, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ if _, err := ifr.CopyOut(t, usermem.Addr(dst)); err != nil {
return err
}
}
diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go
index fcd8013c0..b0d9e4d9e 100644
--- a/pkg/sentry/socket/netstack/netstack_vfs2.go
+++ b/pkg/sentry/socket/netstack/netstack_vfs2.go
@@ -18,12 +18,13 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/amutex"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
- "gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/socket"
- "gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/syserror"
@@ -34,10 +35,13 @@ import (
// SocketVFS2 encapsulates all the state needed to represent a network stack
// endpoint in the kernel context.
+//
+// +stateify savable
type SocketVFS2 struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
+ vfs.LockFD
socketOpsCommon
}
@@ -53,7 +57,8 @@ func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, qu
}
mnt := t.Kernel().SocketMount()
- d := sockfs.NewDentry(t.Credentials(), mnt)
+ d := sockfs.NewDentry(t, mnt)
+ defer d.DecRef(t)
s := &SocketVFS2{
socketOpsCommon: socketOpsCommon{
@@ -64,6 +69,7 @@ func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, qu
protocol: protocol,
},
}
+ s.LockFD.Init(&vfs.FileLocks{})
vfsfd := &s.vfsfd
if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{
DenyPRead: true,
@@ -75,6 +81,13 @@ func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, qu
return vfsfd, nil
}
+// Release implements vfs.FileDescriptionImpl.Release.
+func (s *SocketVFS2) Release(ctx context.Context) {
+ t := kernel.TaskFromContext(ctx)
+ t.Kernel().DeleteSocketVFS2(&s.vfsfd)
+ s.socketOpsCommon.Release(ctx)
+}
+
// Readiness implements waiter.Waitable.Readiness.
func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask {
return s.socketOpsCommon.Readiness(mask)
@@ -147,14 +160,18 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs
// tcpip.Endpoint.
func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
// Issue the accept request to get the new endpoint.
- ep, wq, terr := s.Endpoint.Accept()
+ var peerAddr *tcpip.FullAddress
+ if peerRequested {
+ peerAddr = &tcpip.FullAddress{}
+ }
+ ep, wq, terr := s.Endpoint.Accept(peerAddr)
if terr != nil {
if terr != tcpip.ErrWouldBlock || !blocking {
return 0, nil, 0, syserr.TranslateNetstackError(terr)
}
var err *syserr.Error
- ep, wq, err = s.blockingAccept(t)
+ ep, wq, err = s.blockingAccept(t, peerAddr)
if err != nil {
return 0, nil, 0, err
}
@@ -164,7 +181,7 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block
if err != nil {
return 0, nil, 0, err
}
- defer ns.DecRef()
+ defer ns.DecRef(t)
if err := ns.SetStatusFlags(t, t.Credentials(), uint32(flags&linux.SOCK_NONBLOCK)); err != nil {
return 0, nil, 0, syserr.FromError(err)
@@ -172,13 +189,9 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block
var addr linux.SockAddr
var addrLen uint32
- if peerRequested {
+ if peerAddr != nil {
// Get address of the peer and write it to peer slice.
- var err *syserr.Error
- addr, addrLen, err = ns.Impl().(*SocketVFS2).GetPeerName(t)
- if err != nil {
- return 0, nil, 0, err
- }
+ addr, addrLen = ConvertAddress(s.family, *peerAddr)
}
fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{
@@ -197,7 +210,7 @@ func (s *SocketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal
// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
// tcpip.Endpoint.
-func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
+func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
// TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is
// implemented specifically for netstack.SocketVFS2 rather than
// commonEndpoint. commonEndpoint should be extended to support socket
@@ -207,63 +220,28 @@ func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
- val := int32(0)
+ val := primitive.Int32(0)
s.readMu.Lock()
defer s.readMu.Unlock()
if s.sockOptTimestamp {
val = 1
}
- return val, nil
+ return &val, nil
}
if level == linux.SOL_TCP && name == linux.TCP_INQ {
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
- val := int32(0)
+ val := primitive.Int32(0)
s.readMu.Lock()
defer s.readMu.Unlock()
if s.sockOptInq {
val = 1
}
- return val, nil
+ return &val, nil
}
- if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP {
- switch name {
- case linux.IPT_SO_GET_INFO:
- if outLen < linux.SizeOfIPTGetinfo {
- return nil, syserr.ErrInvalidArgument
- }
-
- stack := inet.StackFromContext(t)
- if stack == nil {
- return nil, syserr.ErrNoDevice
- }
- info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr)
- if err != nil {
- return nil, err
- }
- return info, nil
-
- case linux.IPT_SO_GET_ENTRIES:
- if outLen < linux.SizeOfIPTGetEntries {
- return nil, syserr.ErrInvalidArgument
- }
-
- stack := inet.StackFromContext(t)
- if stack == nil {
- return nil, syserr.ErrNoDevice
- }
- entries, err := netfilter.GetEntries(t, stack.(*Stack).Stack, outPtr, outLen)
- if err != nil {
- return nil, err
- }
- return entries, nil
-
- }
- }
-
- return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outLen)
+ return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outPtr, outLen)
}
// SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by
@@ -293,25 +271,15 @@ func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []by
return nil
}
- if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP {
- switch name {
- case linux.IPT_SO_SET_REPLACE:
- if len(optVal) < linux.SizeOfIPTReplace {
- return syserr.ErrInvalidArgument
- }
-
- stack := inet.StackFromContext(t)
- if stack == nil {
- return syserr.ErrNoDevice
- }
- // Stack must be a netstack stack.
- return netfilter.SetEntries(stack.(*Stack).Stack, optVal)
-
- case linux.IPT_SO_SET_ADD_COUNTERS:
- // TODO(gvisor.dev/issue/170): Counter support.
- return nil
- }
- }
-
return SetSockOpt(t, s, s.Endpoint, level, name, optVal)
}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence)
+}
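One behavioral note on this file's Accept change: the peer address is captured during the accept itself rather than by a follow-up GetPeerName on the new socket, avoiding an extra round trip on the new endpoint. Condensed from the code above:

	var peerAddr *tcpip.FullAddress
	if peerRequested {
		peerAddr = &tcpip.FullAddress{}
	}
	ep, wq, terr := s.Endpoint.Accept(peerAddr)
	// On success with peerRequested set, *peerAddr is already populated
	// and is converted directly:
	//   addr, addrLen = ConvertAddress(s.family, *peerAddr)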
diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go
index f5fa18136..fa9ac9059 100644
--- a/pkg/sentry/socket/netstack/stack.go
+++ b/pkg/sentry/socket/netstack/stack.go
@@ -15,10 +15,11 @@
package netstack
import (
+ "fmt"
+
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/inet"
- "gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/tcpip"
@@ -41,19 +42,29 @@ func (s *Stack) SupportsIPv6() bool {
return s.Stack.CheckNetworkProtocol(ipv6.ProtocolNumber)
}
+// toLinuxARPHardwareType converts a netstack ARPHardwareType to the
+// equivalent Linux ARPHRD_* constant.
+func toLinuxARPHardwareType(t header.ARPHardwareType) uint16 {
+ switch t {
+ case header.ARPHardwareNone:
+ return linux.ARPHRD_NONE
+ case header.ARPHardwareLoopback:
+ return linux.ARPHRD_LOOPBACK
+ case header.ARPHardwareEther:
+ return linux.ARPHRD_ETHER
+ default:
+ panic(fmt.Sprintf("unknown ARPHRD type: %d", t))
+ }
+}
+
// Interfaces implements inet.Stack.Interfaces.
func (s *Stack) Interfaces() map[int32]inet.Interface {
is := make(map[int32]inet.Interface)
for id, ni := range s.Stack.NICInfo() {
- var devType uint16
- if ni.Flags.Loopback {
- devType = linux.ARPHRD_LOOPBACK
- }
is[int32(id)] = inet.Interface{
Name: ni.Name,
Addr: []byte(ni.LinkAddress),
Flags: uint32(nicStateFlagsToLinux(ni.Flags)),
- DeviceType: devType,
+ DeviceType: toLinuxARPHardwareType(ni.ARPHardwareType),
MTU: ni.MTU,
}
}
@@ -89,62 +100,107 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr {
return nicAddrs
}
-// AddInterfaceAddr implements inet.Stack.AddInterfaceAddr.
-func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error {
+// convertAddr converts an InterfaceAddr to a ProtocolAddress.
+func convertAddr(addr inet.InterfaceAddr) (tcpip.ProtocolAddress, error) {
var (
- protocol tcpip.NetworkProtocolNumber
- address tcpip.Address
+ protocol tcpip.NetworkProtocolNumber
+ address tcpip.Address
+ protocolAddress tcpip.ProtocolAddress
)
switch addr.Family {
case linux.AF_INET:
- if len(addr.Addr) < header.IPv4AddressSize {
- return syserror.EINVAL
+ if len(addr.Addr) != header.IPv4AddressSize {
+ return protocolAddress, syserror.EINVAL
}
if addr.PrefixLen > header.IPv4AddressSize*8 {
- return syserror.EINVAL
+ return protocolAddress, syserror.EINVAL
}
protocol = ipv4.ProtocolNumber
- address = tcpip.Address(addr.Addr[:header.IPv4AddressSize])
-
+ address = tcpip.Address(addr.Addr)
case linux.AF_INET6:
- if len(addr.Addr) < header.IPv6AddressSize {
- return syserror.EINVAL
+ if len(addr.Addr) != header.IPv6AddressSize {
+ return protocolAddress, syserror.EINVAL
}
if addr.PrefixLen > header.IPv6AddressSize*8 {
- return syserror.EINVAL
+ return protocolAddress, syserror.EINVAL
}
protocol = ipv6.ProtocolNumber
- address = tcpip.Address(addr.Addr[:header.IPv6AddressSize])
-
+ address = tcpip.Address(addr.Addr)
default:
- return syserror.ENOTSUP
+ return protocolAddress, syserror.ENOTSUP
}
- protocolAddress := tcpip.ProtocolAddress{
+ protocolAddress = tcpip.ProtocolAddress{
Protocol: protocol,
AddressWithPrefix: tcpip.AddressWithPrefix{
Address: address,
PrefixLen: int(addr.PrefixLen),
},
}
+ return protocolAddress, nil
+}
+
+// AddInterfaceAddr implements inet.Stack.AddInterfaceAddr.
+func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error {
+ protocolAddress, err := convertAddr(addr)
+ if err != nil {
+ return err
+ }
// Attach address to interface.
- if err := s.Stack.AddProtocolAddressWithOptions(tcpip.NICID(idx), protocolAddress, stack.CanBePrimaryEndpoint); err != nil {
+ nicID := tcpip.NICID(idx)
+ if err := s.Stack.AddProtocolAddressWithOptions(nicID, protocolAddress, stack.CanBePrimaryEndpoint); err != nil {
+ return syserr.TranslateNetstackError(err).ToError()
+ }
+
+ // Add route for local network if it doesn't exist already.
+ localRoute := tcpip.Route{
+ Destination: protocolAddress.AddressWithPrefix.Subnet(),
+ Gateway: "", // No gateway for local network.
+ NIC: nicID,
+ }
+
+ for _, rt := range s.Stack.GetRouteTable() {
+ if rt.Equal(localRoute) {
+ return nil
+ }
+ }
+
+ // Local route does not exist yet. Add it.
+ s.Stack.AddRoute(localRoute)
+
+ return nil
+}
+
+// RemoveInterfaceAddr implements inet.Stack.RemoveInterfaceAddr.
+func (s *Stack) RemoveInterfaceAddr(idx int32, addr inet.InterfaceAddr) error {
+ protocolAddress, err := convertAddr(addr)
+ if err != nil {
+ return err
+ }
+
+ // Remove addresses matching the address and prefix.
+ nicID := tcpip.NICID(idx)
+ if err := s.Stack.RemoveAddress(nicID, protocolAddress.AddressWithPrefix.Address); err != nil {
return syserr.TranslateNetstackError(err).ToError()
}
- // Add route for local network.
- s.Stack.AddRoute(tcpip.Route{
+ // Remove the corresponding local network route if it exists.
+ localRoute := tcpip.Route{
Destination: protocolAddress.AddressWithPrefix.Subnet(),
Gateway: "", // No gateway for local network.
- NIC: tcpip.NICID(idx),
+ NIC: nicID,
+ }
+ s.Stack.RemoveRoutes(func(rt tcpip.Route) bool {
+ return rt.Equal(localRoute)
})
+
return nil
}
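With RemoveInterfaceAddr in place, address management is symmetric: AddInterfaceAddr installs the on-link route at most once (the route-table scan above), and RemoveInterfaceAddr deletes the address together with that route. A hypothetical usage, values illustrative:

	addr := inet.InterfaceAddr{
		Family:    linux.AF_INET,
		PrefixLen: 24,
		Addr:      []byte{192, 168, 1, 2},
	}
	if err := s.AddInterfaceAddr(1, addr); err != nil {
		return err
	}
	if err := s.RemoveInterfaceAddr(1, addr); err != nil {
		return err
	}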
// TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize.
func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) {
- var rs tcp.ReceiveBufferSizeOption
+ var rs tcpip.TCPReceiveBufferSizeRangeOption
err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &rs)
return inet.TCPBufferSize{
Min: rs.Min,
@@ -155,17 +211,17 @@ func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) {
// SetTCPReceiveBufferSize implements inet.Stack.SetTCPReceiveBufferSize.
func (s *Stack) SetTCPReceiveBufferSize(size inet.TCPBufferSize) error {
- rs := tcp.ReceiveBufferSizeOption{
+ rs := tcpip.TCPReceiveBufferSizeRangeOption{
Min: size.Min,
Default: size.Default,
Max: size.Max,
}
- return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, rs)).ToError()
+ return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &rs)).ToError()
}
// TCPSendBufferSize implements inet.Stack.TCPSendBufferSize.
func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) {
- var ss tcp.SendBufferSizeOption
+ var ss tcpip.TCPSendBufferSizeRangeOption
err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &ss)
return inet.TCPBufferSize{
Min: ss.Min,
@@ -176,24 +232,40 @@ func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) {
// SetTCPSendBufferSize implements inet.Stack.SetTCPSendBufferSize.
func (s *Stack) SetTCPSendBufferSize(size inet.TCPBufferSize) error {
- ss := tcp.SendBufferSizeOption{
+ ss := tcpip.TCPSendBufferSizeRangeOption{
Min: size.Min,
Default: size.Default,
Max: size.Max,
}
- return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, ss)).ToError()
+ return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &ss)).ToError()
}
// TCPSACKEnabled implements inet.Stack.TCPSACKEnabled.
func (s *Stack) TCPSACKEnabled() (bool, error) {
- var sack tcp.SACKEnabled
+ var sack tcpip.TCPSACKEnabled
err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &sack)
return bool(sack), syserr.TranslateNetstackError(err).ToError()
}
// SetTCPSACKEnabled implements inet.Stack.SetTCPSACKEnabled.
func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
- return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(enabled))).ToError()
+ opt := tcpip.TCPSACKEnabled(enabled)
+ return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt)).ToError()
+}
+
+// TCPRecovery implements inet.Stack.TCPRecovery.
+func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) {
+ var recovery tcpip.TCPRecovery
+ if err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &recovery); err != nil {
+ return 0, syserr.TranslateNetstackError(err).ToError()
+ }
+ return inet.TCPLossRecovery(recovery), nil
+}
+
+// SetTCPRecovery implements inet.Stack.SetTCPRecovery.
+func (s *Stack) SetTCPRecovery(recovery inet.TCPLossRecovery) error {
+ opt := tcpip.TCPRecovery(recovery)
+ return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt)).ToError()
}
// Statistics implements inet.Stack.Statistics.
@@ -314,7 +386,7 @@ func (s *Stack) Statistics(stat interface{}, arg string) error {
udp.PacketsSent.Value(), // OutDatagrams.
udp.ReceiveBufferErrors.Value(), // RcvbufErrors.
0, // Udp/SndbufErrors.
- 0, // Udp/InCsumErrors.
+ udp.ChecksumErrors.Value(), // Udp/InCsumErrors.
0, // Udp/IgnoredMulti.
}
default:
@@ -362,16 +434,10 @@ func (s *Stack) RouteTable() []inet.Route {
}
// IPTables returns the stack's iptables.
-func (s *Stack) IPTables() (stack.IPTables, error) {
+func (s *Stack) IPTables() (*stack.IPTables, error) {
return s.Stack.IPTables(), nil
}
-// FillDefaultIPTables sets the stack's iptables to the default tables, which
-// allow and do not modify all traffic.
-func (s *Stack) FillDefaultIPTables() {
- netfilter.FillDefaultIPTables(s.Stack)
-}
-
// Resume implements inet.Stack.Resume.
func (s *Stack) Resume() {
s.Stack.Resume()
@@ -391,3 +457,24 @@ func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint {
func (s *Stack) RestoreCleanupEndpoints(es []stack.TransportEndpoint) {
s.Stack.RestoreCleanupEndpoints(es)
}
+
+// Forwarding implements inet.Stack.Forwarding.
+func (s *Stack) Forwarding(protocol tcpip.NetworkProtocolNumber) bool {
+ switch protocol {
+ case ipv4.ProtocolNumber, ipv6.ProtocolNumber:
+ return s.Stack.Forwarding(protocol)
+ default:
+ panic(fmt.Sprintf("Forwarding(%v) failed: unsupported protocol", protocol))
+ }
+}
+
+// SetForwarding implements inet.Stack.SetForwarding.
+func (s *Stack) SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error {
+ switch protocol {
+ case ipv4.ProtocolNumber, ipv6.ProtocolNumber:
+ s.Stack.SetForwarding(protocol, enable)
+ default:
+ panic(fmt.Sprintf("SetForwarding(%v) failed: unsupported protocol", protocol))
+ }
+ return nil
+}
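The stack-wide TCP knobs follow the same option migration as the sockets: tcp.* option types become tcpip.TCP* types and are passed to SetTransportProtocolOption by pointer. The pattern, as used throughout this file:

	opt := tcpip.TCPSACKEnabled(true)
	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
		return syserr.TranslateNetstackError(err).ToError()
	}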
diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go
index 6580bd6e9..fd31479e5 100644
--- a/pkg/sentry/socket/socket.go
+++ b/pkg/sentry/socket/socket.go
@@ -25,6 +25,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/sentry/device"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
@@ -45,8 +46,8 @@ type ControlMessages struct {
}
// Release releases Unix domain socket credentials and rights.
-func (c *ControlMessages) Release() {
- c.Unix.Release()
+func (c *ControlMessages) Release(ctx context.Context) {
+ c.Unix.Release(ctx)
}
// Socket is an interface combining fs.FileOperations and SocketOps,
@@ -86,7 +87,7 @@ type SocketOps interface {
Shutdown(t *kernel.Task, how int) *syserr.Error
// GetSockOpt implements the getsockopt(2) linux syscall.
- GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error)
+ GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error)
// SetSockOpt implements the setsockopt(2) linux syscall.
SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error
@@ -407,7 +408,6 @@ func emitUnimplementedEvent(t *kernel.Task, name int) {
linux.SO_MARK,
linux.SO_MAX_PACING_RATE,
linux.SO_NOFCS,
- linux.SO_NO_CHECK,
linux.SO_OOBINLINE,
linux.SO_PASSCRED,
linux.SO_PASSSEC,
diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD
index de2cc4bdf..cce0acc33 100644
--- a/pkg/sentry/socket/unix/BUILD
+++ b/pkg/sentry/socket/unix/BUILD
@@ -1,12 +1,37 @@
load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
package(licenses = ["notice"])
+go_template_instance(
+ name = "socket_refs",
+ out = "socket_refs.go",
+ package = "unix",
+ prefix = "socketOperations",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "SocketOperations",
+ },
+)
+
+go_template_instance(
+ name = "socket_vfs2_refs",
+ out = "socket_vfs2_refs.go",
+ package = "unix",
+ prefix = "socketVFS2",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "SocketVFS2",
+ },
+)
+
go_library(
name = "unix",
srcs = [
"device.go",
"io.go",
+ "socket_refs.go",
+ "socket_vfs2_refs.go",
"unix.go",
"unix_vfs2.go",
],
@@ -15,12 +40,16 @@ go_library(
"//pkg/abi/linux",
"//pkg/context",
"//pkg/fspath",
+ "//pkg/log",
+ "//pkg/marshal",
"//pkg/refs",
+ "//pkg/refsvfs2",
"//pkg/safemem",
"//pkg/sentry/arch",
"//pkg/sentry/device",
"//pkg/sentry/fs",
"//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/fsimpl/sockfs",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/time",
diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD
index c708b6030..3ebbd28b0 100644
--- a/pkg/sentry/socket/unix/transport/BUILD
+++ b/pkg/sentry/socket/unix/transport/BUILD
@@ -15,6 +15,17 @@ go_template_instance(
},
)
+go_template_instance(
+ name = "queue_refs",
+ out = "queue_refs.go",
+ package = "transport",
+ prefix = "queue",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "queue",
+ },
+)
+
go_library(
name = "transport",
srcs = [
@@ -22,6 +33,7 @@ go_library(
"connectioned_state.go",
"connectionless.go",
"queue.go",
+ "queue_refs.go",
"transport_message_list.go",
"unix.go",
],
@@ -32,6 +44,7 @@ go_library(
"//pkg/ilist",
"//pkg/log",
"//pkg/refs",
+ "//pkg/refsvfs2",
"//pkg/sync",
"//pkg/syserr",
"//pkg/tcpip",
diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go
index ce5b94ee7..aa4f3c04d 100644
--- a/pkg/sentry/socket/unix/transport/connectioned.go
+++ b/pkg/sentry/socket/unix/transport/connectioned.go
@@ -142,9 +142,9 @@ func NewPair(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) (E
}
q1 := &queue{ReaderQueue: a.Queue, WriterQueue: b.Queue, limit: initialLimit}
- q1.EnableLeakCheck("transport.queue")
+ q1.EnableLeakCheck()
q2 := &queue{ReaderQueue: b.Queue, WriterQueue: a.Queue, limit: initialLimit}
- q2.EnableLeakCheck("transport.queue")
+ q2.EnableLeakCheck()
if stype == linux.SOCK_STREAM {
a.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q1}}
@@ -211,7 +211,7 @@ func (e *connectionedEndpoint) Listening() bool {
// The socket will be a fresh state after a call to close and may be reused.
// That is, close may be used to "unbind" or "disconnect" the socket in error
// paths.
-func (e *connectionedEndpoint) Close() {
+func (e *connectionedEndpoint) Close(ctx context.Context) {
e.Lock()
var c ConnectedEndpoint
var r Receiver
@@ -233,7 +233,7 @@ func (e *connectionedEndpoint) Close() {
case e.Listening():
close(e.acceptedChan)
for n := range e.acceptedChan {
- n.Close()
+ n.Close(ctx)
}
e.acceptedChan = nil
e.path = ""
@@ -241,18 +241,18 @@ func (e *connectionedEndpoint) Close() {
e.Unlock()
if c != nil {
c.CloseNotify()
- c.Release()
+ c.Release(ctx)
}
if r != nil {
r.CloseNotify()
- r.Release()
+ r.Release(ctx)
}
}
// BidirectionalConnect implements BoundEndpoint.BidirectionalConnect.
func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error {
if ce.Type() != e.stype {
- return syserr.ErrConnectionRefused
+ return syserr.ErrWrongProtocolForSocket
}
// Check if ce is e to avoid a deadlock.
@@ -300,14 +300,14 @@ func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce Conn
}
readQueue := &queue{ReaderQueue: ce.WaiterQueue(), WriterQueue: ne.Queue, limit: initialLimit}
- readQueue.EnableLeakCheck("transport.queue")
+ readQueue.EnableLeakCheck()
ne.connected = &connectedEndpoint{
endpoint: ce,
writeQueue: readQueue,
}
writeQueue := &queue{ReaderQueue: ne.Queue, WriterQueue: ce.WaiterQueue(), limit: initialLimit}
- writeQueue.EnableLeakCheck("transport.queue")
+ writeQueue.EnableLeakCheck()
if e.stype == linux.SOCK_STREAM {
ne.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{readQueue: writeQueue}}
} else {
@@ -340,7 +340,7 @@ func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce Conn
return nil
default:
// Busy; return ECONNREFUSED per spec.
- ne.Close()
+ ne.Close(ctx)
e.Unlock()
ce.Unlock()
return syserr.ErrConnectionRefused
@@ -391,7 +391,7 @@ func (e *connectionedEndpoint) Listen(backlog int) *syserr.Error {
}
// Accept accepts a new connection.
-func (e *connectionedEndpoint) Accept() (Endpoint, *syserr.Error) {
+func (e *connectionedEndpoint) Accept(peerAddr *tcpip.FullAddress) (Endpoint, *syserr.Error) {
e.Lock()
defer e.Unlock()
@@ -401,6 +401,18 @@ func (e *connectionedEndpoint) Accept() (Endpoint, *syserr.Error) {
select {
case ne := <-e.acceptedChan:
+ if peerAddr != nil {
+ ne.Lock()
+ c := ne.connected
+ ne.Unlock()
+ if c != nil {
+ addr, err := c.GetLocalAddress()
+ if err != nil {
+ return nil, syserr.TranslateNetstackError(err)
+ }
+ *peerAddr = addr
+ }
+ }
return ne, nil
default:
@@ -476,6 +488,9 @@ func (e *connectionedEndpoint) Readiness(mask waiter.EventMask) waiter.EventMask
// State implements socket.Socket.State.
func (e *connectionedEndpoint) State() uint32 {
+ e.Lock()
+ defer e.Unlock()
+
if e.Connected() {
return linux.SS_CONNECTED
}
diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go
index 4b06d63ac..f8aacca13 100644
--- a/pkg/sentry/socket/unix/transport/connectionless.go
+++ b/pkg/sentry/socket/unix/transport/connectionless.go
@@ -42,7 +42,7 @@ var (
func NewConnectionless(ctx context.Context) Endpoint {
ep := &connectionlessEndpoint{baseEndpoint{Queue: &waiter.Queue{}}}
q := queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: initialLimit}
- q.EnableLeakCheck("transport.queue")
+ q.EnableLeakCheck()
ep.receiver = &queueReceiver{readQueue: &q}
return ep
}
@@ -54,10 +54,10 @@ func (e *connectionlessEndpoint) isBound() bool {
// Close puts the endpoint in a closed state and frees all resources associated
// with it.
-func (e *connectionlessEndpoint) Close() {
+func (e *connectionlessEndpoint) Close(ctx context.Context) {
e.Lock()
if e.connected != nil {
- e.connected.Release()
+ e.connected.Release(ctx)
e.connected = nil
}
@@ -71,7 +71,7 @@ func (e *connectionlessEndpoint) Close() {
e.Unlock()
r.CloseNotify()
- r.Release()
+ r.Release(ctx)
}
// BidirectionalConnect implements BoundEndpoint.BidirectionalConnect.
@@ -108,10 +108,10 @@ func (e *connectionlessEndpoint) SendMsg(ctx context.Context, data [][]byte, c C
if err != nil {
return 0, syserr.ErrInvalidEndpointState
}
- defer connected.Release()
+ defer connected.Release(ctx)
e.Lock()
- n, notify, err := connected.Send(data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)})
+ n, notify, err := connected.Send(ctx, data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)})
e.Unlock()
if notify {
@@ -135,7 +135,7 @@ func (e *connectionlessEndpoint) Connect(ctx context.Context, server BoundEndpoi
e.Lock()
if e.connected != nil {
- e.connected.Release()
+ e.connected.Release(ctx)
}
e.connected = connected
e.Unlock()
@@ -144,12 +144,12 @@ func (e *connectionlessEndpoint) Connect(ctx context.Context, server BoundEndpoi
}
// Listen starts listening on the connection.
-func (e *connectionlessEndpoint) Listen(int) *syserr.Error {
+func (*connectionlessEndpoint) Listen(int) *syserr.Error {
return syserr.ErrNotSupported
}
// Accept accepts a new connection.
-func (e *connectionlessEndpoint) Accept() (Endpoint, *syserr.Error) {
+func (*connectionlessEndpoint) Accept(*tcpip.FullAddress) (Endpoint, *syserr.Error) {
return nil, syserr.ErrNotSupported
}
diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go
index d8f3ad63d..342def28f 100644
--- a/pkg/sentry/socket/unix/transport/queue.go
+++ b/pkg/sentry/socket/unix/transport/queue.go
@@ -15,7 +15,7 @@
package transport
import (
- "gvisor.dev/gvisor/pkg/refs"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/tcpip"
@@ -27,7 +27,7 @@ import (
//
// +stateify savable
type queue struct {
- refs.AtomicRefCount
+ queueRefs
ReaderQueue *waiter.Queue
WriterQueue *waiter.Queue
@@ -57,21 +57,23 @@ func (q *queue) Close() {
// Both the read and write queues must be notified after resetting:
// q.ReaderQueue.Notify(waiter.EventIn)
// q.WriterQueue.Notify(waiter.EventOut)
-func (q *queue) Reset() {
+func (q *queue) Reset(ctx context.Context) {
q.mu.Lock()
for cur := q.dataList.Front(); cur != nil; cur = cur.Next() {
- cur.Release()
+ cur.Release(ctx)
}
q.dataList.Reset()
q.used = 0
q.mu.Unlock()
}
-// DecRef implements RefCounter.DecRef with destructor q.Reset.
-func (q *queue) DecRef() {
- q.DecRefWithDestructor(q.Reset)
- // We don't need to notify after resetting because no one cares about
- // this queue after all references have been dropped.
+// DecRef implements RefCounter.DecRef.
+func (q *queue) DecRef(ctx context.Context) {
+ q.queueRefs.DecRef(func() {
+ // We don't need to notify after resetting because no one cares about
+ // this queue after all references have been dropped.
+ q.Reset(ctx)
+ })
}
// IsReadable determines if q is currently readable.
@@ -111,7 +113,7 @@ func (q *queue) IsWritable() bool {
//
// If notify is true, ReaderQueue.Notify must be called:
// q.ReaderQueue.Notify(waiter.EventIn)
-func (q *queue) Enqueue(data [][]byte, c ControlMessages, from tcpip.FullAddress, discardEmpty bool, truncate bool) (l int64, notify bool, err *syserr.Error) {
+func (q *queue) Enqueue(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress, discardEmpty bool, truncate bool) (l int64, notify bool, err *syserr.Error) {
q.mu.Lock()
if q.closed {
@@ -124,7 +126,7 @@ func (q *queue) Enqueue(data [][]byte, c ControlMessages, from tcpip.FullAddress
}
if discardEmpty && l == 0 {
q.mu.Unlock()
- c.Release()
+ c.Release(ctx)
return 0, false, nil
}
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index 2f1b127df..b648273a4 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -32,12 +32,14 @@ import (
const initialLimit = 16 * 1024
// A RightsControlMessage is a control message containing FDs.
+//
+// +stateify savable
type RightsControlMessage interface {
// Clone returns a copy of the RightsControlMessage.
Clone() RightsControlMessage
// Release releases any resources owned by the RightsControlMessage.
- Release()
+ Release(ctx context.Context)
}
// A CredentialsControlMessage is a control message containing Unix credentials.
@@ -74,9 +76,9 @@ func (c *ControlMessages) Clone() ControlMessages {
}
// Release releases both the credentials and the rights.
-func (c *ControlMessages) Release() {
+func (c *ControlMessages) Release(ctx context.Context) {
if c.Rights != nil {
- c.Rights.Release()
+ c.Rights.Release(ctx)
}
*c = ControlMessages{}
}
@@ -90,7 +92,7 @@ type Endpoint interface {
// Close puts the endpoint in a closed state and frees all resources
// associated with it.
- Close()
+ Close(ctx context.Context)
// RecvMsg reads data and a control message from the endpoint. This method
// does not block if there is no data pending.
@@ -151,7 +153,10 @@ type Endpoint interface {
// block if no new connections are available.
//
// The returned Queue is the wait queue for the newly created endpoint.
- Accept() (Endpoint, *syserr.Error)
+ //
+ // If peerAddr is not nil, it is populated with the address of the
+ // connected peer on a successful accept.
+ Accept(peerAddr *tcpip.FullAddress) (Endpoint, *syserr.Error)
// Bind binds the endpoint to a specific local address and port.
// Specifying a NIC is optional.
@@ -172,9 +177,8 @@ type Endpoint interface {
// connected.
GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error)
- // SetSockOpt sets a socket option. opt should be one of the tcpip.*Option
- // types.
- SetSockOpt(opt interface{}) *tcpip.Error
+ // SetSockOpt sets a socket option.
+ SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error
// SetSockOptBool sets a socket option for simple cases when a value has
// the int type.
@@ -184,9 +188,8 @@ type Endpoint interface {
// the int type.
SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error
- // GetSockOpt gets a socket option. opt should be a pointer to one of the
- // tcpip.*Option types.
- GetSockOpt(opt interface{}) *tcpip.Error
+ // GetSockOpt gets a socket option.
+ GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error
// GetSockOptBool gets a socket option for simple cases when a return
// value has the int type.
@@ -199,6 +202,9 @@ type Endpoint interface {
// State returns the current state of the socket, as represented by Linux in
// procfs.
State() uint32
+
+ // LastError implements tcpip.Endpoint.LastError.
+ LastError() *tcpip.Error
}
// A Credentialer is a socket or endpoint that supports the SO_PASSCRED socket
@@ -252,7 +258,7 @@ type BoundEndpoint interface {
// Release releases any resources held by the BoundEndpoint. It must be
// called before dropping all references to a BoundEndpoint returned by a
// function.
- Release()
+ Release(ctx context.Context)
}
// message represents a message passed over a Unix domain socket.
@@ -281,8 +287,8 @@ func (m *message) Length() int64 {
}
// Release releases any resources held by the message.
-func (m *message) Release() {
- m.Control.Release()
+func (m *message) Release(ctx context.Context) {
+ m.Control.Release(ctx)
}
// Peek returns a copy of the message.
@@ -304,7 +310,7 @@ type Receiver interface {
// See Endpoint.RecvMsg for documentation on shared arguments.
//
// notify indicates if RecvNotify should be called.
- Recv(data [][]byte, creds bool, numRights int, peek bool) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, source tcpip.FullAddress, notify bool, err *syserr.Error)
+ Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, source tcpip.FullAddress, notify bool, err *syserr.Error)
// RecvNotify notifies the Receiver of a successful Recv. This must not be
// called while holding any endpoint locks.
@@ -332,8 +338,8 @@ type Receiver interface {
RecvMaxQueueSize() int64
// Release releases any resources owned by the Receiver. It should be
- // called before droping all references to a Receiver.
- Release()
+ // called before dropping all references to a Receiver.
+ Release(ctx context.Context)
}
// queueReceiver implements Receiver for datagram sockets.
@@ -344,7 +350,7 @@ type queueReceiver struct {
}
// Recv implements Receiver.Recv.
-func (q *queueReceiver) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
+func (q *queueReceiver) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
var m *message
var notify bool
var err *syserr.Error
@@ -398,8 +404,8 @@ func (q *queueReceiver) RecvMaxQueueSize() int64 {
}
// Release implements Receiver.Release.
-func (q *queueReceiver) Release() {
- q.readQueue.DecRef()
+func (q *queueReceiver) Release(ctx context.Context) {
+ q.readQueue.DecRef(ctx)
}
// streamQueueReceiver implements Receiver for stream sockets.
@@ -456,7 +462,7 @@ func (q *streamQueueReceiver) RecvMaxQueueSize() int64 {
}
// Recv implements Receiver.Recv.
-func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
+func (q *streamQueueReceiver) Recv(ctx context.Context, data [][]byte, wantCreds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
q.mu.Lock()
defer q.mu.Unlock()
@@ -483,7 +489,7 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int,
c := q.control.Clone()
// Don't consume data since we are peeking.
- copied, data, _ = vecCopy(data, q.buffer)
+ copied, _, _ = vecCopy(data, q.buffer)
return copied, copied, c, false, q.addr, notify, nil
}
@@ -502,7 +508,7 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int,
var cmTruncated bool
if c.Rights != nil && numRights == 0 {
- c.Rights.Release()
+ c.Rights.Release(ctx)
c.Rights = nil
cmTruncated = true
}
@@ -557,7 +563,7 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int,
// Consume rights.
if numRights == 0 {
cmTruncated = true
- q.control.Rights.Release()
+ q.control.Rights.Release(ctx)
} else {
c.Rights = q.control.Rights
haveRights = true
@@ -568,6 +574,12 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int,
return copied, copied, c, cmTruncated, q.addr, notify, nil
}
+// Release implements Receiver.Release.
+func (q *streamQueueReceiver) Release(ctx context.Context) {
+ q.queueReceiver.Release(ctx)
+ q.control.Release(ctx)
+}
+
// A ConnectedEndpoint is an Endpoint that can be used to send Messages.
type ConnectedEndpoint interface {
// Passcred implements Endpoint.Passcred.
@@ -582,7 +594,7 @@ type ConnectedEndpoint interface {
//
// syserr.ErrWouldBlock can be returned along with a partial write if
// the caller should block to send the rest of the data.
- Send(data [][]byte, c ControlMessages, from tcpip.FullAddress) (n int64, notify bool, err *syserr.Error)
+ Send(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress) (n int64, notify bool, err *syserr.Error)
// SendNotify notifies the ConnectedEndpoint of a successful Send. This
// must not be called while holding any endpoint locks.
@@ -615,8 +627,8 @@ type ConnectedEndpoint interface {
SendMaxQueueSize() int64
// Release releases any resources owned by the ConnectedEndpoint. It should
- // be called before droping all references to a ConnectedEndpoint.
- Release()
+ // be called before dropping all references to a ConnectedEndpoint.
+ Release(ctx context.Context)
// CloseUnread sets the fact that this end is closed with unread data to
// the peer socket.
@@ -654,7 +666,7 @@ func (e *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error)
}
// Send implements ConnectedEndpoint.Send.
-func (e *connectedEndpoint) Send(data [][]byte, c ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) {
+func (e *connectedEndpoint) Send(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) {
discardEmpty := false
truncate := false
if e.endpoint.Type() == linux.SOCK_STREAM {
@@ -669,7 +681,7 @@ func (e *connectedEndpoint) Send(data [][]byte, c ControlMessages, from tcpip.Fu
truncate = true
}
- return e.writeQueue.Enqueue(data, c, from, discardEmpty, truncate)
+ return e.writeQueue.Enqueue(ctx, data, c, from, discardEmpty, truncate)
}
// SendNotify implements ConnectedEndpoint.SendNotify.
@@ -707,8 +719,8 @@ func (e *connectedEndpoint) SendMaxQueueSize() int64 {
}
// Release implements ConnectedEndpoint.Release.
-func (e *connectedEndpoint) Release() {
- e.writeQueue.DecRef()
+func (e *connectedEndpoint) Release(ctx context.Context) {
+ e.writeQueue.DecRef(ctx)
}
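As a minimal sketch of the pattern this hunk establishes (the helper below is hypothetical; the interface is the one defined above): Release now takes the caller's context so that dropping the last reference, which may free queued messages and their file references, has a context available.

func closeConnected(ctx context.Context, e ConnectedEndpoint) {
	// The DecRef on the write queue happens inside Release, now with ctx.
	e.Release(ctx)
}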
// CloseUnread implements ConnectedEndpoint.CloseUnread.
@@ -742,6 +754,9 @@ type baseEndpoint struct {
// path is not empty if the endpoint has been bound,
// or may be used if the endpoint is connected.
path string
+
+ // linger is used for the SO_LINGER socket option.
+ linger tcpip.LingerOption
}
// EventRegister implements waiter.Waitable.EventRegister.
@@ -798,7 +813,7 @@ func (e *baseEndpoint) RecvMsg(ctx context.Context, data [][]byte, creds bool, n
return 0, 0, ControlMessages{}, false, syserr.ErrNotConnected
}
- recvLen, msgLen, cms, cmt, a, notify, err := e.receiver.Recv(data, creds, numRights, peek)
+ recvLen, msgLen, cms, cmt, a, notify, err := e.receiver.Recv(ctx, data, creds, numRights, peek)
e.Unlock()
if err != nil {
return 0, 0, ControlMessages{}, false, err
@@ -827,7 +842,7 @@ func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMess
return 0, syserr.ErrAlreadyConnected
}
- n, notify, err := e.connected.Send(data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)})
+ n, notify, err := e.connected.Send(ctx, data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)})
e.Unlock()
if notify {
@@ -837,8 +852,14 @@ func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMess
return n, err
}
-// SetSockOpt sets a socket option. Currently not supported.
-func (e *baseEndpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+// SetSockOpt sets a socket option.
+func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
+ switch v := opt.(type) {
+ case *tcpip.LingerOption:
+ e.Lock()
+ e.linger = *v
+ e.Unlock()
+ }
return nil
}
@@ -866,7 +887,7 @@ func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
func (e *baseEndpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
switch opt {
- case tcpip.KeepaliveEnabledOption:
+ case tcpip.KeepaliveEnabledOption, tcpip.AcceptConnOption:
return false, nil
case tcpip.PasscredOption:
@@ -940,9 +961,12 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
}
// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error {
- switch opt.(type) {
- case tcpip.ErrorOption:
+func (e *baseEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
+ switch o := opt.(type) {
+ case *tcpip.LingerOption:
+ e.Lock()
+ *o = e.linger
+ e.Unlock()
return nil
default:
@@ -951,6 +975,11 @@ func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error {
}
}
+// LastError implements Endpoint.LastError.
+func (*baseEndpoint) LastError() *tcpip.Error {
+ return nil
+}
+
// Shutdown closes the read and/or write end of the endpoint connection to its
// peer.
func (e *baseEndpoint) Shutdown(flags tcpip.ShutdownFlags) *syserr.Error {
@@ -1001,6 +1030,6 @@ func (e *baseEndpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
}
// Release implements BoundEndpoint.Release.
-func (*baseEndpoint) Release() {
+func (*baseEndpoint) Release(context.Context) {
// Binding a baseEndpoint doesn't take a reference.
}
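A hedged sketch of the typed-option API these hunks move to, replacing interface{} and a type switch on values with pointer types; LingerOption's Enabled field is assumed from upstream tcpip, and lingerRoundTrip is hypothetical:

func lingerRoundTrip(ep *baseEndpoint) (tcpip.LingerOption, *tcpip.Error) {
	if err := ep.SetSockOpt(&tcpip.LingerOption{Enabled: true}); err != nil {
		return tcpip.LingerOption{}, err
	}
	var got tcpip.LingerOption
	// Matched by the *tcpip.LingerOption case in GetSockOpt above.
	err := ep.GetSockOpt(&got)
	return got, err
}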
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 5b29e9d7f..adad485a9 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -24,7 +24,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
- "gvisor.dev/gvisor/pkg/refs"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
@@ -55,13 +55,14 @@ type SocketOperations struct {
fsutil.FileNoopFlush `state:"nosave"`
fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+ socketOperationsRefs
socketOpsCommon
}
// New creates a new unix socket.
func New(ctx context.Context, endpoint transport.Endpoint, stype linux.SockType) *fs.File {
dirent := socket.NewDirent(ctx, unixSocketDevice)
- defer dirent.DecRef()
+ defer dirent.DecRef(ctx)
return NewWithDirent(ctx, dirent, endpoint, stype, fs.FileFlags{Read: true, Write: true, NonSeekable: true})
}
@@ -79,34 +80,41 @@ func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, sty
stype: stype,
},
}
- s.EnableLeakCheck("unix.SocketOperations")
-
+ s.EnableLeakCheck()
return fs.NewFile(ctx, d, flags, &s)
}
+// DecRef implements RefCounter.DecRef.
+func (s *SocketOperations) DecRef(ctx context.Context) {
+ s.socketOperationsRefs.DecRef(func() {
+ s.ep.Close(ctx)
+ if s.abstractNamespace != nil {
+ s.abstractNamespace.Remove(s.abstractName, s)
+ }
+ })
+}
+
+// Release implements fs.FileOperations.Release.
+func (s *SocketOperations) Release(ctx context.Context) {
+ // Release only decrements a reference on s because s may be referenced in
+ // the abstract socket namespace.
+ s.DecRef(ctx)
+}
+
// socketOpsCommon contains the socket operations common to VFS1 and VFS2.
//
// +stateify savable
type socketOpsCommon struct {
- refs.AtomicRefCount
socket.SendReceiveTimeout
ep transport.Endpoint
stype linux.SockType
-}
-// DecRef implements RefCounter.DecRef.
-func (s *socketOpsCommon) DecRef() {
- s.DecRefWithDestructor(func() {
- s.ep.Close()
- })
-}
-
-// Release implemements fs.FileOperations.Release.
-func (s *socketOpsCommon) Release() {
- // Release only decrements a reference on s because s may be referenced in
- // the abstract socket namespace.
- s.DecRef()
+ // abstractName and abstractNamespace indicate the name and namespace of the
+ // socket if it is bound to an abstract socket namespace. Once the socket is
+ // bound, they cannot be modified.
+ abstractName string
+ abstractNamespace *kernel.AbstractSocketNamespace
}
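A sketch of why the two new fields exist (bindAbstract is a hypothetical helper condensing the Bind change below): the name and namespace are recorded at bind time so that the destructor run by the last DecRef can unbind the socket, matching Linux's lifetime for abstract entries.

func bindAbstract(t *kernel.Task, s *SocketOperations, name string, bep transport.BoundEndpoint) *syserr.Error {
	asn := t.AbstractSockets()
	if err := asn.Bind(t, name, bep, s); err != nil {
		return syserr.ErrPortInUse // corresponds to EADDRINUSE
	}
	// Remembered so DecRef's destructor can call asn.Remove(name, s).
	s.abstractName = name
	s.abstractNamespace = asn
	return nil
}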
func (s *socketOpsCommon) isPacket() bool {
@@ -184,8 +192,8 @@ func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO,
// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
// a transport.Endpoint.
-func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
- return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen)
+func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
+ return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outPtr, outLen)
}
// Listen implements the linux syscall listen(2) for sockets backed by
@@ -196,7 +204,7 @@ func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error {
// blockingAccept implements a blocking version of accept(2), that is, if no
// connections are ready to be accepted, it will block until one becomes ready.
-func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr.Error) {
+func (s *SocketOperations) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (transport.Endpoint, *syserr.Error) {
// Register for notifications.
e, ch := waiter.NewChannelEntry(nil)
s.EventRegister(&e, waiter.EventIn)
@@ -205,7 +213,7 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, *
// Try to accept the connection; if it fails, then wait until we get a
// notification.
for {
- if ep, err := s.ep.Accept(); err != syserr.ErrWouldBlock {
+ if ep, err := s.ep.Accept(peerAddr); err != syserr.ErrWouldBlock {
return ep, err
}
@@ -218,22 +226,25 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, *
// Accept implements the linux syscall accept(2) for sockets backed by
// a transport.Endpoint.
func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
- // Issue the accept request to get the new endpoint.
- ep, err := s.ep.Accept()
+ var peerAddr *tcpip.FullAddress
+ if peerRequested {
+ peerAddr = &tcpip.FullAddress{}
+ }
+ ep, err := s.ep.Accept(peerAddr)
if err != nil {
if err != syserr.ErrWouldBlock || !blocking {
return 0, nil, 0, err
}
var err *syserr.Error
- ep, err = s.blockingAccept(t)
+ ep, err = s.blockingAccept(t, peerAddr)
if err != nil {
return 0, nil, 0, err
}
}
ns := New(t, ep, s.stype)
- defer ns.DecRef()
+ defer ns.DecRef(t)
if flags&linux.SOCK_NONBLOCK != 0 {
flags := ns.Flags()
@@ -243,13 +254,8 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
var addr linux.SockAddr
var addrLen uint32
- if peerRequested {
- // Get address of the peer.
- var err *syserr.Error
- addr, addrLen, err = ns.FileOperations.(*SocketOperations).GetPeerName(t)
- if err != nil {
- return 0, nil, 0, err
- }
+ if peerAddr != nil {
+ addr, addrLen = netstack.ConvertAddress(linux.AF_UNIX, *peerAddr)
}
fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{
@@ -283,17 +289,21 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
if t.IsNetworkNamespaced() {
return syserr.ErrInvalidEndpointState
}
- if err := t.AbstractSockets().Bind(p[1:], bep, s); err != nil {
+ asn := t.AbstractSockets()
+ name := p[1:]
+ if err := asn.Bind(t, name, bep, s); err != nil {
// syserr.ErrPortInUse corresponds to EADDRINUSE.
return syserr.ErrPortInUse
}
+ s.abstractName = name
+ s.abstractNamespace = asn
} else {
// The parent and name.
var d *fs.Dirent
var name string
cwd := t.FSContext().WorkingDirectory()
- defer cwd.DecRef()
+ defer cwd.DecRef(t)
// Is there no slash at all?
if !strings.Contains(p, "/") {
@@ -301,7 +311,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
name = p
} else {
root := t.FSContext().RootDirectory()
- defer root.DecRef()
+ defer root.DecRef(t)
// Find the last path component; we know that something follows
// that final slash, otherwise extractPath() would have failed.
lastSlash := strings.LastIndex(p, "/")
@@ -317,7 +327,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
// No path available.
return syserr.ErrNoSuchFile
}
- defer d.DecRef()
+ defer d.DecRef(t)
name = p[lastSlash+1:]
}
@@ -331,7 +341,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
if err != nil {
return syserr.ErrPortInUse
}
- childDir.DecRef()
+ childDir.DecRef(t)
}
return nil
@@ -377,9 +387,9 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint,
FollowFinalSymlink: true,
}
ep, e := t.Kernel().VFS().BoundEndpointAt(t, t.Credentials(), &pop, &vfs.BoundEndpointOptions{path})
- root.DecRef()
+ root.DecRef(t)
if relPath {
- start.DecRef()
+ start.DecRef(t)
}
if e != nil {
return nil, syserr.FromError(e)
@@ -392,15 +402,15 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint,
cwd := t.FSContext().WorkingDirectory()
remainingTraversals := uint(fs.DefaultTraversalLimit)
d, e := t.MountNamespace().FindInode(t, root, cwd, path, &remainingTraversals)
- cwd.DecRef()
- root.DecRef()
+ cwd.DecRef(t)
+ root.DecRef(t)
if e != nil {
return nil, syserr.FromError(e)
}
// Extract the endpoint if one is there.
ep := d.Inode.BoundEndpoint(path)
- d.DecRef()
+ d.DecRef(t)
if ep == nil {
// No socket!
return nil, syserr.ErrConnectionRefused
@@ -414,10 +424,21 @@ func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool
if err != nil {
return err
}
- defer ep.Release()
+ defer ep.Release(t)
// Connect the server endpoint.
- return s.ep.Connect(t, ep)
+ err = s.ep.Connect(t, ep)
+
+ if err == syserr.ErrWrongProtocolForSocket {
+ // For abstract sockets, Linux returns ErrConnectionRefused
+ // instead of ErrWrongProtocolForSocket.
+ path, _ := extractPath(sockaddr)
+ if len(path) > 0 && path[0] == 0 {
+ err = syserr.ErrConnectionRefused
+ }
+ }
+
+ return err
}
// Write implements fs.FileOperations.Write.
@@ -448,15 +469,25 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b
To: nil,
}
if len(to) > 0 {
- ep, err := extractEndpoint(t, to)
- if err != nil {
- return 0, err
- }
- defer ep.Release()
- w.To = ep
+ switch s.stype {
+ case linux.SOCK_SEQPACKET:
+ to = nil
+ case linux.SOCK_STREAM:
+ if s.State() == linux.SS_CONNECTED {
+ return 0, syserr.ErrAlreadyConnected
+ }
+ return 0, syserr.ErrNotSupported
+ default:
+ ep, err := extractEndpoint(t, to)
+ if err != nil {
+ return 0, err
+ }
+ defer ep.Release(t)
+ w.To = ep
- if ep.Passcred() && w.Control.Credentials == nil {
- w.Control.Credentials = control.MakeCreds(t)
+ if ep.Passcred() && w.Control.Credentials == nil {
+ w.Control.Credentials = control.MakeCreds(t)
+ }
}
}
@@ -541,13 +572,17 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS
if dst.NumBytes() == 0 {
return 0, nil
}
- return dst.CopyOutFrom(ctx, &EndpointReader{
+ r := &EndpointReader{
Ctx: ctx,
Endpoint: s.ep,
NumRights: 0,
Peek: false,
From: nil,
- })
+ }
+ n, err := dst.CopyOutFrom(ctx, r)
+ // Drop control messages.
+ r.Control.Release(ctx)
+ return n, err
}
// RecvMsg implements the linux syscall recvmsg(2) for sockets backed by
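A condensed sketch of the new Read shape (readAndDrop is hypothetical): the reader is kept in a variable so that control messages gathered during the copy, such as SCM_RIGHTS file references, can be released afterward, since plain read(2) discards them.

func readAndDrop(ctx context.Context, s *SocketOperations, dst usermem.IOSequence) (int64, error) {
	r := &EndpointReader{Ctx: ctx, Endpoint: s.ep}
	n, err := dst.CopyOutFrom(ctx, r)
	// Without this, references held by collected control messages leak.
	r.Control.Release(ctx)
	return n, err
}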
diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go
index 45e109361..7a78444dc 100644
--- a/pkg/sentry/socket/unix/unix_vfs2.go
+++ b/pkg/sentry/socket/unix/unix_vfs2.go
@@ -18,7 +18,9 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/socket"
@@ -35,11 +37,15 @@ import (
// SocketVFS2 implements socket.SocketVFS2 (and by extension,
// vfs.FileDescriptionImpl) for Unix sockets.
+//
+// +stateify savable
type SocketVFS2 struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
+ vfs.LockFD
+ socketVFS2Refs
socketOpsCommon
}
@@ -49,9 +55,10 @@ var _ = socket.SocketVFS2(&SocketVFS2{})
// returns a corresponding file description.
func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) (*vfs.FileDescription, *syserr.Error) {
mnt := t.Kernel().SocketMount()
- d := sockfs.NewDentry(t.Credentials(), mnt)
+ d := sockfs.NewDentry(t, mnt)
+ defer d.DecRef(t)
- fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d)
+ fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d, &vfs.FileLocks{})
if err != nil {
return nil, syserr.FromError(err)
}
@@ -60,7 +67,7 @@ func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType)
// NewFileDescription creates and returns a socket file description
// corresponding to the given mount and dentry.
-func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint32, mnt *vfs.Mount, d *vfs.Dentry) (*vfs.FileDescription, error) {
+func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint32, mnt *vfs.Mount, d *vfs.Dentry, locks *vfs.FileLocks) (*vfs.FileDescription, error) {
// You can create AF_UNIX, SOCK_RAW sockets. They're the same as
// SOCK_DGRAM and don't require CAP_NET_RAW.
if stype == linux.SOCK_RAW {
@@ -73,6 +80,8 @@ func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint3
stype: stype,
},
}
+ sock.EnableLeakCheck()
+ sock.LockFD.Init(locks)
vfsfd := &sock.vfsfd
if err := vfsfd.Init(sock, flags, mnt, d, &vfs.FileDescriptionOptions{
DenyPRead: true,
@@ -84,15 +93,34 @@ func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint3
return vfsfd, nil
}
+// DecRef implements RefCounter.DecRef.
+func (s *SocketVFS2) DecRef(ctx context.Context) {
+ s.socketVFS2Refs.DecRef(func() {
+ t := kernel.TaskFromContext(ctx)
+ t.Kernel().DeleteSocketVFS2(&s.vfsfd)
+ s.ep.Close(ctx)
+ if s.abstractNamespace != nil {
+ s.abstractNamespace.Remove(s.abstractName, s)
+ }
+ })
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (s *SocketVFS2) Release(ctx context.Context) {
+ // Release only decrements a reference on s because s may be referenced in
+ // the abstract socket namespace.
+ s.DecRef(ctx)
+}
+
// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
// a transport.Endpoint.
-func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
- return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen)
+func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
+ return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outPtr, outLen)
}
// blockingAccept implements a blocking version of accept(2), that is, if no
// connections are ready to be accepted, it will block until one becomes ready.
-func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr.Error) {
+func (s *SocketVFS2) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (transport.Endpoint, *syserr.Error) {
// Register for notifications.
e, ch := waiter.NewChannelEntry(nil)
s.socketOpsCommon.EventRegister(&e, waiter.EventIn)
@@ -101,7 +129,7 @@ func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr
// Try to accept the connection; if it fails, then wait until we get a
// notification.
for {
- if ep, err := s.ep.Accept(); err != syserr.ErrWouldBlock {
+ if ep, err := s.ep.Accept(peerAddr); err != syserr.ErrWouldBlock {
return ep, err
}
@@ -114,15 +142,18 @@ func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr
// Accept implements the linux syscall accept(2) for sockets backed by
// a transport.Endpoint.
func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
- // Issue the accept request to get the new endpoint.
- ep, err := s.ep.Accept()
+ var peerAddr *tcpip.FullAddress
+ if peerRequested {
+ peerAddr = &tcpip.FullAddress{}
+ }
+ ep, err := s.ep.Accept(peerAddr)
if err != nil {
if err != syserr.ErrWouldBlock || !blocking {
return 0, nil, 0, err
}
var err *syserr.Error
- ep, err = s.blockingAccept(t)
+ ep, err = s.blockingAccept(t, peerAddr)
if err != nil {
return 0, nil, 0, err
}
@@ -132,7 +163,7 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block
if err != nil {
return 0, nil, 0, err
}
- defer ns.DecRef()
+ defer ns.DecRef(t)
if flags&linux.SOCK_NONBLOCK != 0 {
ns.SetStatusFlags(t, t.Credentials(), linux.SOCK_NONBLOCK)
@@ -140,13 +171,8 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block
var addr linux.SockAddr
var addrLen uint32
- if peerRequested {
- // Get address of the peer.
- var err *syserr.Error
- addr, addrLen, err = ns.Impl().(*SocketVFS2).GetPeerName(t)
- if err != nil {
- return 0, nil, 0, err
- }
+ if peerAddr != nil {
+ addr, addrLen = netstack.ConvertAddress(linux.AF_UNIX, *peerAddr)
}
fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{
@@ -179,19 +205,23 @@ func (s *SocketVFS2) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
if t.IsNetworkNamespaced() {
return syserr.ErrInvalidEndpointState
}
- if err := t.AbstractSockets().Bind(p[1:], bep, s); err != nil {
+ asn := t.AbstractSockets()
+ name := p[1:]
+ if err := asn.Bind(t, name, bep, s); err != nil {
// syserr.ErrPortInUse corresponds to EADDRINUSE.
return syserr.ErrPortInUse
}
+ s.abstractName = name
+ s.abstractNamespace = asn
} else {
path := fspath.Parse(p)
root := t.FSContext().RootDirectoryVFS2()
- defer root.DecRef()
+ defer root.DecRef(t)
start := root
relPath := !path.Absolute
if relPath {
start = t.FSContext().WorkingDirectoryVFS2()
- defer start.DecRef()
+ defer start.DecRef(t)
}
pop := vfs.PathOperation{
Root: root,
@@ -238,13 +268,17 @@ func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.
if dst.NumBytes() == 0 {
return 0, nil
}
- return dst.CopyOutFrom(ctx, &EndpointReader{
+ r := &EndpointReader{
Ctx: ctx,
Endpoint: s.ep,
NumRights: 0,
Peek: false,
From: nil,
- })
+ }
+ n, err := dst.CopyOutFrom(ctx, r)
+ // Drop control messages.
+ r.Control.Release(ctx)
+ return n, err
}
// PWrite implements vfs.FileDescriptionImpl.
@@ -297,6 +331,16 @@ func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []by
return netstack.SetSockOpt(t, s, s.ep, level, name, optVal)
}
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence)
+}
+
// providerVFS2 is a unix domain socket provider for VFS2.
type providerVFS2 struct{}
@@ -319,7 +363,7 @@ func (*providerVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int)
f, err := NewSockfsFile(t, ep, stype)
if err != nil {
- ep.Close()
+ ep.Close(t)
return nil, err
}
return f, nil
@@ -343,14 +387,14 @@ func (*providerVFS2) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*
ep1, ep2 := transport.NewPair(t, stype, t.Kernel())
s1, err := NewSockfsFile(t, ep1, stype)
if err != nil {
- ep1.Close()
- ep2.Close()
+ ep1.Close(t)
+ ep2.Close(t)
return nil, nil, err
}
s2, err := NewSockfsFile(t, ep2, stype)
if err != nil {
- s1.DecRef()
- ep2.Close()
+ s1.DecRef(t)
+ ep2.Close(t)
return nil, nil, err
}
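A sketch of the accept flow both VFS1 and VFS2 now share (acceptPeer is a hypothetical condensation): the peer address is captured by Accept itself rather than a follow-up GetPeerName, so a peer that disconnects in between can no longer make the lookup fail.

func acceptPeer(s *SocketVFS2, peerRequested bool) (transport.Endpoint, linux.SockAddr, uint32, *syserr.Error) {
	var peerAddr *tcpip.FullAddress
	if peerRequested {
		peerAddr = &tcpip.FullAddress{} // filled in by Accept on success
	}
	ep, err := s.ep.Accept(peerAddr)
	if err != nil {
		return nil, nil, 0, err
	}
	var addr linux.SockAddr
	var addrLen uint32
	if peerAddr != nil {
		addr, addrLen = netstack.ConvertAddress(linux.AF_UNIX, *peerAddr)
	}
	return ep, addr, addrLen, nil
}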
diff --git a/pkg/sentry/state/BUILD b/pkg/sentry/state/BUILD
index 0ea4aab8b..563d60578 100644
--- a/pkg/sentry/state/BUILD
+++ b/pkg/sentry/state/BUILD
@@ -12,10 +12,12 @@ go_library(
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
+ "//pkg/context",
"//pkg/log",
"//pkg/sentry/inet",
"//pkg/sentry/kernel",
"//pkg/sentry/time",
+ "//pkg/sentry/vfs",
"//pkg/sentry/watchdog",
"//pkg/state/statefile",
"//pkg/syserror",
diff --git a/pkg/sentry/state/state.go b/pkg/sentry/state/state.go
index 9eb626b76..167754537 100644
--- a/pkg/sentry/state/state.go
+++ b/pkg/sentry/state/state.go
@@ -19,10 +19,12 @@ import (
"fmt"
"io"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/time"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sentry/watchdog"
"gvisor.dev/gvisor/pkg/state/statefile"
"gvisor.dev/gvisor/pkg/syserror"
@@ -57,11 +59,14 @@ type SaveOpts struct {
}
// Save saves the system state.
-func (opts SaveOpts) Save(k *kernel.Kernel, w *watchdog.Watchdog) error {
+func (opts SaveOpts) Save(ctx context.Context, k *kernel.Kernel, w *watchdog.Watchdog) error {
log.Infof("Sandbox save started, pausing all tasks.")
k.Pause()
- defer k.Unpause()
- defer log.Infof("Tasks resumed after save.")
+ k.ReceiveTaskStates()
+ defer func() {
+ k.Unpause()
+ log.Infof("Tasks resumed after save.")
+ }()
w.Stop()
defer w.Start()
@@ -78,7 +83,7 @@ func (opts SaveOpts) Save(k *kernel.Kernel, w *watchdog.Watchdog) error {
err = ErrStateFile{err}
} else {
// Save the kernel.
- err = k.SaveTo(wc)
+ err = k.SaveTo(ctx, wc)
// ENOSPC is a state file error. This error can only come from
// writing the state file, and not from fs.FileOperations.Fsync
@@ -105,7 +110,7 @@ type LoadOpts struct {
}
// Load loads the given kernel, setting the provided platform and stack.
-func (opts LoadOpts) Load(k *kernel.Kernel, n inet.Stack, clocks time.Clocks) error {
+func (opts LoadOpts) Load(ctx context.Context, k *kernel.Kernel, n inet.Stack, clocks time.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error {
// Open the file.
r, m, err := statefile.NewReader(opts.Source, opts.Key)
if err != nil {
@@ -115,5 +120,5 @@ func (opts LoadOpts) Load(k *kernel.Kernel, n inet.Stack, clocks time.Clocks) er
previousMetadata = m
// Restore the Kernel object graph.
- return k.LoadFrom(r, n, clocks)
+ return k.LoadFrom(ctx, r, n, clocks, vfsOpts)
}
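For illustration, hypothetical wrappers showing the updated checkpoint/restore signatures, with all arguments assumed to be in scope at the call sites:

func checkpoint(ctx context.Context, k *kernel.Kernel, w *watchdog.Watchdog, opts SaveOpts) error {
	return opts.Save(ctx, k, w) // pauses tasks, saves, resumes
}

func restore(ctx context.Context, k *kernel.Kernel, n inet.Stack, clocks time.Clocks, opts LoadOpts, vfsOpts *vfs.CompleteRestoreOptions) error {
	return opts.Load(ctx, k, n, clocks, vfsOpts)
}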
diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD
index 88d5db9fc..a920180d3 100644
--- a/pkg/sentry/strace/BUILD
+++ b/pkg/sentry/strace/BUILD
@@ -28,6 +28,7 @@ go_library(
"//pkg/binary",
"//pkg/bits",
"//pkg/eventchannel",
+ "//pkg/marshal/primitive",
"//pkg/seccomp",
"//pkg/sentry/arch",
"//pkg/sentry/kernel",
diff --git a/pkg/sentry/strace/epoll.go b/pkg/sentry/strace/epoll.go
index a6e48b836..ae3b998c8 100644
--- a/pkg/sentry/strace/epoll.go
+++ b/pkg/sentry/strace/epoll.go
@@ -26,7 +26,7 @@ import (
func epollEvent(t *kernel.Task, eventAddr usermem.Addr) string {
var e linux.EpollEvent
- if _, err := t.CopyIn(eventAddr, &e); err != nil {
+ if _, err := e.CopyIn(t, eventAddr); err != nil {
return fmt.Sprintf("%#x {error reading event: %v}", eventAddr, err)
}
var sb strings.Builder
@@ -41,7 +41,7 @@ func epollEvents(t *kernel.Task, eventsAddr usermem.Addr, numEvents, maxBytes ui
addr := eventsAddr
for i := uint64(0); i < numEvents; i++ {
var e linux.EpollEvent
- if _, err := t.CopyIn(addr, &e); err != nil {
+ if _, err := e.CopyIn(t, addr); err != nil {
fmt.Fprintf(&sb, "{error reading event at %#x: %v}", addr, err)
continue
}
@@ -50,10 +50,10 @@ func epollEvents(t *kernel.Task, eventsAddr usermem.Addr, numEvents, maxBytes ui
sb.WriteString("...")
break
}
- if _, ok := addr.AddLength(uint64(linux.SizeOfEpollEvent)); !ok {
- fmt.Fprintf(&sb, "{error reading event at %#x: EFAULT}", addr)
- continue
- }
+ // Allowing addr to overflow is consistent with Linux, and harmless; if
+ // this isn't the last iteration of the loop, the next call to CopyIn
+ // will just fail with EFAULT.
+ addr, _ = addr.AddLength(uint64(linux.SizeOfEpollEvent))
}
sb.WriteString("}")
return sb.String()
@@ -75,7 +75,7 @@ var epollEventEvents = abi.FlagSet{
{Flag: linux.EPOLLPRI, Name: "EPOLLPRI"},
{Flag: linux.EPOLLOUT, Name: "EPOLLOUT"},
{Flag: linux.EPOLLERR, Name: "EPOLLERR"},
- {Flag: linux.EPOLLHUP, Name: "EPULLHUP"},
+ {Flag: linux.EPOLLHUP, Name: "EPOLLHUP"},
{Flag: linux.EPOLLRDNORM, Name: "EPOLLRDNORM"},
{Flag: linux.EPOLLRDBAND, Name: "EPOLLRDBAND"},
{Flag: linux.EPOLLWRNORM, Name: "EPOLLWRNORM"},
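A minimal sketch of the marshalling change (readEvent is hypothetical): linux.EpollEvent implements marshal.Marshallable, so it copies itself in from task memory with generated code instead of reflection-driven t.CopyIn.

func readEvent(t *kernel.Task, addr usermem.Addr) (linux.EpollEvent, error) {
	var e linux.EpollEvent
	_, err := e.CopyIn(t, addr) // generated by go_marshal
	return e, err
}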
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index c0512de89..cc5f70cd4 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -21,6 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/abi"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
@@ -166,7 +167,7 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64)
}
buf := make([]byte, length)
- if _, err := t.CopyIn(addr, &buf); err != nil {
+ if _, err := t.CopyInBytes(addr, buf); err != nil {
return fmt.Sprintf("%#x (error decoding control: %v)", addr, err)
}
@@ -302,7 +303,7 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64)
func msghdr(t *kernel.Task, addr usermem.Addr, printContent bool, maxBytes uint64) string {
var msg slinux.MessageHeader64
- if err := slinux.CopyInMessageHeader64(t, addr, &msg); err != nil {
+ if _, err := msg.CopyIn(t, addr); err != nil {
return fmt.Sprintf("%#x (error decoding msghdr: %v)", addr, err)
}
s := fmt.Sprintf(
@@ -380,9 +381,9 @@ func postSockAddr(t *kernel.Task, addr usermem.Addr, lengthPtr usermem.Addr) str
func copySockLen(t *kernel.Task, addr usermem.Addr) (uint32, error) {
// socklen_t is 32-bits.
- var l uint32
- _, err := t.CopyIn(addr, &l)
- return l, err
+ var l primitive.Uint32
+ _, err := l.CopyIn(t, addr)
+ return uint32(l), err
}
func sockLenPointer(t *kernel.Task, addr usermem.Addr) string {
@@ -436,22 +437,22 @@ func getSockOptVal(t *kernel.Task, level, optname uint64, optVal usermem.Addr, o
func sockOptVal(t *kernel.Task, level, optname uint64, optVal usermem.Addr, optLen uint64, maximumBlobSize uint) string {
switch optLen {
case 1:
- var v uint8
- _, err := t.CopyIn(optVal, &v)
+ var v primitive.Uint8
+ _, err := v.CopyIn(t, optVal)
if err != nil {
return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err)
}
return fmt.Sprintf("%#x {value=%v}", optVal, v)
case 2:
- var v uint16
- _, err := t.CopyIn(optVal, &v)
+ var v primitive.Uint16
+ _, err := v.CopyIn(t, optVal)
if err != nil {
return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err)
}
return fmt.Sprintf("%#x {value=%v}", optVal, v)
case 4:
- var v uint32
- _, err := t.CopyIn(optVal, &v)
+ var v primitive.Uint32
+ _, err := v.CopyIn(t, optVal)
if err != nil {
return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err)
}
@@ -521,6 +522,7 @@ var sockOptNames = map[uint64]abi.ValueSet{
linux.IP_ROUTER_ALERT: "IP_ROUTER_ALERT",
linux.IP_PKTOPTIONS: "IP_PKTOPTIONS",
linux.IP_MTU: "IP_MTU",
+ linux.SO_ORIGINAL_DST: "SO_ORIGINAL_DST",
},
linux.SOL_SOCKET: {
linux.SO_ERROR: "SO_ERROR",
@@ -631,6 +633,8 @@ var sockOptNames = map[uint64]abi.ValueSet{
linux.IPV6_UNICAST_IF: "IPV6_UNICAST_IF",
linux.MCAST_MSFILTER: "MCAST_MSFILTER",
linux.IPV6_ADDRFORM: "IPV6_ADDRFORM",
+ linux.IP6T_SO_GET_INFO: "IP6T_SO_GET_INFO",
+ linux.IP6T_SO_GET_ENTRIES: "IP6T_SO_GET_ENTRIES",
},
linux.SOL_NETLINK: {
linux.NETLINK_BROADCAST_ERROR: "NETLINK_BROADCAST_ERROR",
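The same pattern for bare scalars, sketched with a hypothetical helper: the marshal/primitive wrappers give fixed-size integers CopyIn/CopyOut methods of their own.

func readSockLen(t *kernel.Task, addr usermem.Addr) (uint32, error) {
	var l primitive.Uint32 // socklen_t is 32 bits
	_, err := l.CopyIn(t, addr)
	return uint32(l), err
}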
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index 68ca537c8..396744597 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -17,17 +17,16 @@
package strace
import (
- "encoding/binary"
"fmt"
"strconv"
"strings"
- "syscall"
"time"
"gvisor.dev/gvisor/pkg/abi"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/bits"
"gvisor.dev/gvisor/pkg/eventchannel"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/seccomp"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -91,7 +90,7 @@ func iovecs(t *kernel.Task, addr usermem.Addr, iovcnt int, printContent bool, ma
}
b := make([]byte, size)
- amt, err := t.CopyIn(ar.Start, b)
+ amt, err := t.CopyInBytes(ar.Start, b)
if err != nil {
iovs[i] = fmt.Sprintf("{base=%#x, len=%d, %q..., error decoding string: %v}", ar.Start, ar.Length(), b[:amt], err)
continue
@@ -118,7 +117,7 @@ func dump(t *kernel.Task, addr usermem.Addr, size uint, maximumBlobSize uint) st
}
b := make([]byte, size)
- amt, err := t.CopyIn(addr, b)
+ amt, err := t.CopyInBytes(addr, b)
if err != nil {
return fmt.Sprintf("%#x (error decoding string: %s)", addr, err)
}
@@ -147,14 +146,14 @@ func fd(t *kernel.Task, fd int32) string {
root := t.FSContext().RootDirectory()
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(t)
}
if fd == linux.AT_FDCWD {
wd := t.FSContext().WorkingDirectory()
var name string
if wd != nil {
- defer wd.DecRef()
+ defer wd.DecRef(t)
name, _ = wd.FullName(root)
} else {
name = "(unknown cwd)"
@@ -167,7 +166,7 @@ func fd(t *kernel.Task, fd int32) string {
// Cast FD to uint64 to avoid printing negative hex.
return fmt.Sprintf("%#x (bad FD)", uint64(fd))
}
- defer file.DecRef()
+ defer file.DecRef(t)
name, _ := file.Dirent.FullName(root)
return fmt.Sprintf("%#x %s", fd, name)
@@ -175,12 +174,12 @@ func fd(t *kernel.Task, fd int32) string {
func fdVFS2(t *kernel.Task, fd int32) string {
root := t.FSContext().RootDirectoryVFS2()
- defer root.DecRef()
+ defer root.DecRef(t)
vfsObj := root.Mount().Filesystem().VirtualFilesystem()
if fd == linux.AT_FDCWD {
wd := t.FSContext().WorkingDirectoryVFS2()
- defer wd.DecRef()
+ defer wd.DecRef(t)
name, _ := vfsObj.PathnameWithDeleted(t, root, wd)
return fmt.Sprintf("AT_FDCWD %s", name)
@@ -191,7 +190,7 @@ func fdVFS2(t *kernel.Task, fd int32) string {
// Cast FD to uint64 to avoid printing negative hex.
return fmt.Sprintf("%#x (bad FD)", uint64(fd))
}
- defer file.DecRef()
+ defer file.DecRef(t)
name, _ := vfsObj.PathnameWithDeleted(t, root, file.VirtualDentry())
return fmt.Sprintf("%#x %s", fd, name)
@@ -199,7 +198,7 @@ func fdVFS2(t *kernel.Task, fd int32) string {
func fdpair(t *kernel.Task, addr usermem.Addr) string {
var fds [2]int32
- _, err := t.CopyIn(addr, &fds)
+ _, err := primitive.CopyInt32SliceIn(t, addr, fds[:])
if err != nil {
return fmt.Sprintf("%#x (error decoding fds: %s)", addr, err)
}
@@ -209,7 +208,7 @@ func fdpair(t *kernel.Task, addr usermem.Addr) string {
func uname(t *kernel.Task, addr usermem.Addr) string {
var u linux.UtsName
- if _, err := t.CopyIn(addr, &u); err != nil {
+ if _, err := u.CopyIn(t, addr); err != nil {
return fmt.Sprintf("%#x (error decoding utsname: %s)", addr, err)
}
@@ -222,7 +221,7 @@ func utimensTimespec(t *kernel.Task, addr usermem.Addr) string {
}
var tim linux.Timespec
- if _, err := t.CopyIn(addr, &tim); err != nil {
+ if _, err := tim.CopyIn(t, addr); err != nil {
return fmt.Sprintf("%#x (error decoding timespec: %s)", addr, err)
}
@@ -244,7 +243,7 @@ func timespec(t *kernel.Task, addr usermem.Addr) string {
}
var tim linux.Timespec
- if _, err := t.CopyIn(addr, &tim); err != nil {
+ if _, err := tim.CopyIn(t, addr); err != nil {
return fmt.Sprintf("%#x (error decoding timespec: %s)", addr, err)
}
return fmt.Sprintf("%#x {sec=%v nsec=%v}", addr, tim.Sec, tim.Nsec)
@@ -256,7 +255,7 @@ func timeval(t *kernel.Task, addr usermem.Addr) string {
}
var tim linux.Timeval
- if _, err := t.CopyIn(addr, &tim); err != nil {
+ if _, err := tim.CopyIn(t, addr); err != nil {
return fmt.Sprintf("%#x (error decoding timeval: %s)", addr, err)
}
@@ -268,8 +267,8 @@ func utimbuf(t *kernel.Task, addr usermem.Addr) string {
return "null"
}
- var utim syscall.Utimbuf
- if _, err := t.CopyIn(addr, &utim); err != nil {
+ var utim linux.Utime
+ if _, err := utim.CopyIn(t, addr); err != nil {
return fmt.Sprintf("%#x (error decoding utimbuf: %s)", addr, err)
}
@@ -282,7 +281,7 @@ func stat(t *kernel.Task, addr usermem.Addr) string {
}
var stat linux.Stat
- if _, err := t.CopyIn(addr, &stat); err != nil {
+ if _, err := stat.CopyIn(t, addr); err != nil {
return fmt.Sprintf("%#x (error decoding stat: %s)", addr, err)
}
return fmt.Sprintf("%#x {dev=%d, ino=%d, mode=%s, nlink=%d, uid=%d, gid=%d, rdev=%d, size=%d, blksize=%d, blocks=%d, atime=%s, mtime=%s, ctime=%s}", addr, stat.Dev, stat.Ino, linux.FileMode(stat.Mode), stat.Nlink, stat.UID, stat.GID, stat.Rdev, stat.Size, stat.Blksize, stat.Blocks, time.Unix(stat.ATime.Sec, stat.ATime.Nsec), time.Unix(stat.MTime.Sec, stat.MTime.Nsec), time.Unix(stat.CTime.Sec, stat.CTime.Nsec))
@@ -294,7 +293,7 @@ func itimerval(t *kernel.Task, addr usermem.Addr) string {
}
interval := timeval(t, addr)
- value := timeval(t, addr+usermem.Addr(binary.Size(linux.Timeval{})))
+ value := timeval(t, addr+usermem.Addr((*linux.Timeval)(nil).SizeBytes()))
return fmt.Sprintf("%#x {interval=%s, value=%s}", addr, interval, value)
}
@@ -304,7 +303,7 @@ func itimerspec(t *kernel.Task, addr usermem.Addr) string {
}
interval := timespec(t, addr)
- value := timespec(t, addr+usermem.Addr(binary.Size(linux.Timespec{})))
+ value := timespec(t, addr+usermem.Addr((*linux.Timespec)(nil).SizeBytes()))
return fmt.Sprintf("%#x {interval=%s, value=%s}", addr, interval, value)
}
@@ -330,7 +329,7 @@ func rusage(t *kernel.Task, addr usermem.Addr) string {
}
var ru linux.Rusage
- if _, err := t.CopyIn(addr, &ru); err != nil {
+ if _, err := ru.CopyIn(t, addr); err != nil {
return fmt.Sprintf("%#x (error decoding rusage: %s)", addr, err)
}
return fmt.Sprintf("%#x %+v", addr, ru)
@@ -342,7 +341,7 @@ func capHeader(t *kernel.Task, addr usermem.Addr) string {
}
var hdr linux.CapUserHeader
- if _, err := t.CopyIn(addr, &hdr); err != nil {
+ if _, err := hdr.CopyIn(t, addr); err != nil {
return fmt.Sprintf("%#x (error decoding header: %s)", addr, err)
}
@@ -367,7 +366,7 @@ func capData(t *kernel.Task, hdrAddr, dataAddr usermem.Addr) string {
}
var hdr linux.CapUserHeader
- if _, err := t.CopyIn(hdrAddr, &hdr); err != nil {
+ if _, err := hdr.CopyIn(t, hdrAddr); err != nil {
return fmt.Sprintf("%#x (error decoding header: %v)", dataAddr, err)
}
@@ -376,7 +375,7 @@ func capData(t *kernel.Task, hdrAddr, dataAddr usermem.Addr) string {
switch hdr.Version {
case linux.LINUX_CAPABILITY_VERSION_1:
var data linux.CapUserData
- if _, err := t.CopyIn(dataAddr, &data); err != nil {
+ if _, err := data.CopyIn(t, dataAddr); err != nil {
return fmt.Sprintf("%#x (error decoding data: %v)", dataAddr, err)
}
p = uint64(data.Permitted)
@@ -384,7 +383,7 @@ func capData(t *kernel.Task, hdrAddr, dataAddr usermem.Addr) string {
e = uint64(data.Effective)
case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3:
var data [2]linux.CapUserData
- if _, err := t.CopyIn(dataAddr, &data); err != nil {
+ if _, err := linux.CopyCapUserDataSliceIn(t, dataAddr, data[:]); err != nil {
return fmt.Sprintf("%#x (error decoding data: %v)", dataAddr, err)
}
p = uint64(data[0].Permitted) | (uint64(data[1].Permitted) << 32)
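A small sketch of the offset arithmetic that SizeBytes now carries, removing the encoding/binary dependency (secondTimevalAddr is hypothetical):

func secondTimevalAddr(addr usermem.Addr) usermem.Addr {
	// An itimerval is two consecutive timevals; step over the first.
	return addr + usermem.Addr((*linux.Timeval)(nil).SizeBytes())
}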
diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go
index d9fb808c0..d23a0068a 100644
--- a/pkg/sentry/syscalls/epoll.go
+++ b/pkg/sentry/syscalls/epoll.go
@@ -28,7 +28,7 @@ import (
// CreateEpoll implements the epoll_create(2) linux syscall.
func CreateEpoll(t *kernel.Task, closeOnExec bool) (int32, error) {
file := epoll.NewEventPoll(t)
- defer file.DecRef()
+ defer file.DecRef(t)
fd, err := t.NewFDFrom(0, file, kernel.FDFlags{
CloseOnExec: closeOnExec,
@@ -47,14 +47,14 @@ func AddEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask
if epollfile == nil {
return syserror.EBADF
}
- defer epollfile.DecRef()
+ defer epollfile.DecRef(t)
// Get the target file id.
file := t.GetFile(fd)
if file == nil {
return syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the epollPoll operations.
e, ok := epollfile.FileOperations.(*epoll.EventPoll)
@@ -73,14 +73,14 @@ func UpdateEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, m
if epollfile == nil {
return syserror.EBADF
}
- defer epollfile.DecRef()
+ defer epollfile.DecRef(t)
// Get the target file id.
file := t.GetFile(fd)
if file == nil {
return syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the epollPoll operations.
e, ok := epollfile.FileOperations.(*epoll.EventPoll)
@@ -99,14 +99,14 @@ func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error {
if epollfile == nil {
return syserror.EBADF
}
- defer epollfile.DecRef()
+ defer epollfile.DecRef(t)
// Get the target file id.
file := t.GetFile(fd)
if file == nil {
return syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the epollPoll operations.
e, ok := epollfile.FileOperations.(*epoll.EventPoll)
@@ -115,7 +115,7 @@ func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error {
}
// Try to remove the entry.
- return e.RemoveEntry(epoll.FileIdentifier{file, fd})
+ return e.RemoveEntry(t, epoll.FileIdentifier{file, fd})
}
// WaitEpoll implements the epoll_wait(2) linux syscall.
@@ -125,7 +125,7 @@ func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]linux.EpollEve
if epollfile == nil {
return nil, syserror.EBADF
}
- defer epollfile.DecRef()
+ defer epollfile.DecRef(t)
// Extract the epollPoll operations.
e, ok := epollfile.FileOperations.(*epoll.EventPoll)
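The idiom repeated through this file, condensed into a hypothetical helper: every FD lookup takes a reference that must now be dropped with the task as context.

func withEpoll(t *kernel.Task, epfd int32, fn func(*epoll.EventPoll) error) error {
	file := t.GetFile(epfd)
	if file == nil {
		return syserror.EBADF
	}
	defer file.DecRef(t) // context-aware release
	e, ok := file.FileOperations.(*epoll.EventPoll)
	if !ok {
		return syserror.EBADF
	}
	return fn(e)
}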
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index 217fcfef2..a2e441448 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -21,6 +21,7 @@ go_library(
"sys_identity.go",
"sys_inotify.go",
"sys_lseek.go",
+ "sys_membarrier.go",
"sys_mempolicy.go",
"sys_mmap.go",
"sys_mount.go",
@@ -56,6 +57,7 @@ go_library(
"sys_xattr.go",
"timespec.go",
],
+ marshal = True,
visibility = ["//:sandbox"],
deps = [
"//pkg/abi",
@@ -64,6 +66,8 @@ go_library(
"//pkg/bpf",
"//pkg/context",
"//pkg/log",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
"//pkg/metric",
"//pkg/rand",
"//pkg/safemem",
diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go
index 64de56ac5..dab6207c0 100644
--- a/pkg/sentry/syscalls/linux/error.go
+++ b/pkg/sentry/syscalls/linux/error.go
@@ -36,8 +36,8 @@ var (
// errors, we may consume the error and return only the partial read/write.
//
// op and f are used only for panics.
-func HandleIOErrorVFS2(t *kernel.Task, partialResult bool, err, intr error, op string, f *vfs.FileDescription) error {
- known, err := handleIOErrorImpl(t, partialResult, err, intr, op)
+func HandleIOErrorVFS2(t *kernel.Task, partialResult bool, ioerr, intr error, op string, f *vfs.FileDescription) error {
+ known, err := handleIOErrorImpl(t, partialResult, ioerr, intr, op)
if err != nil {
return err
}
@@ -46,7 +46,7 @@ func HandleIOErrorVFS2(t *kernel.Task, partialResult bool, err, intr error, op s
fs := f.Mount().Filesystem().VirtualFilesystem()
root := vfs.RootFromContext(t)
name, _ := fs.PathnameWithDeleted(t, root, f.VirtualDentry())
- log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q", partialResult, err, err, op, name)
+ log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q", partialResult, ioerr, ioerr, op, name)
partialResultOnce.Do(partialResultMetric.Increment)
}
return nil
@@ -56,15 +56,15 @@ func HandleIOErrorVFS2(t *kernel.Task, partialResult bool, err, intr error, op s
// errors, we may consume the error and return only the partial read/write.
//
// op and f are used only for panics.
-func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op string, f *fs.File) error {
- known, err := handleIOErrorImpl(t, partialResult, err, intr, op)
+func handleIOError(t *kernel.Task, partialResult bool, ioerr, intr error, op string, f *fs.File) error {
+ known, err := handleIOErrorImpl(t, partialResult, ioerr, intr, op)
if err != nil {
return err
}
if !known {
// An unknown error is encountered with a partial read/write.
name, _ := f.Dirent.FullName(nil /* ignore chroot */)
- log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, err, err, op, name, f.FileOperations)
+ log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, ioerr, ioerr, op, name, f.FileOperations)
partialResultOnce.Do(partialResultMetric.Increment)
}
return nil
@@ -147,7 +147,7 @@ func handleIOErrorImpl(t *kernel.Task, partialResult bool, err, intr error, op s
}
switch err.(type) {
- case kernel.SyscallRestartErrno:
+ case syserror.SyscallRestartErrno:
// Identical to the EINTR case.
return true, nil
}
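A sketch of the pitfall the rename avoids (example is hypothetical): reusing err for both the I/O error parameter and the return value of handleIOErrorImpl shadows the original error, so the traceback would log the wrong value; distinct names keep both visible.

func example(t *kernel.Task, partialResult bool, ioerr, intr error, op string) error {
	known, err := handleIOErrorImpl(t, partialResult, ioerr, intr, op)
	if err != nil {
		return err // the translated error, not the raw I/O error
	}
	if !known {
		log.Traceback("partialResult %v, err (type %T) %v for %s", partialResult, ioerr, ioerr, op)
	}
	return nil
}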
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index ea4f9b1a7..36902d177 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -118,7 +118,7 @@ var AMD64 = &kernel.SyscallTable{
63: syscalls.Supported("uname", Uname),
64: syscalls.Supported("semget", Semget),
65: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil),
- 66: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil),
+ 66: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil),
67: syscalls.Supported("shmdt", Shmdt),
68: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
69: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
@@ -138,7 +138,7 @@ var AMD64 = &kernel.SyscallTable{
83: syscalls.Supported("mkdir", Mkdir),
84: syscalls.Supported("rmdir", Rmdir),
85: syscalls.Supported("creat", Creat),
- 86: syscalls.Supported("link", Link),
+ 86: syscalls.PartiallySupported("link", Link, "Limited support with Gofer. Link count and linked files may get out of sync because gVisor is not aware of external hardlinks.", nil),
87: syscalls.Supported("unlink", Unlink),
88: syscalls.Supported("symlink", Symlink),
89: syscalls.Supported("readlink", Readlink),
@@ -305,9 +305,9 @@ var AMD64 = &kernel.SyscallTable{
250: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil),
251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
- 253: syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil),
- 254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil),
- 255: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil),
+ 253: syscalls.PartiallySupported("inotify_init", InotifyInit, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil),
+ 254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil),
+ 255: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil),
256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil),
257: syscalls.Supported("openat", Openat),
258: syscalls.Supported("mkdirat", Mkdirat),
@@ -317,7 +317,7 @@ var AMD64 = &kernel.SyscallTable{
262: syscalls.Supported("fstatat", Fstatat),
263: syscalls.Supported("unlinkat", Unlinkat),
264: syscalls.Supported("renameat", Renameat),
- 265: syscalls.Supported("linkat", Linkat),
+ 265: syscalls.PartiallySupported("linkat", Linkat, "See link(2).", nil),
266: syscalls.Supported("symlinkat", Symlinkat),
267: syscalls.Supported("readlinkat", Readlinkat),
268: syscalls.Supported("fchmodat", Fchmodat),
@@ -325,8 +325,8 @@ var AMD64 = &kernel.SyscallTable{
270: syscalls.Supported("pselect", Pselect),
271: syscalls.Supported("ppoll", Ppoll),
272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil),
- 273: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil),
- 274: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil),
+ 273: syscalls.Supported("set_robust_list", SetRobustList),
+ 274: syscalls.Supported("get_robust_list", GetRobustList),
275: syscalls.Supported("splice", Splice),
276: syscalls.Supported("tee", Tee),
277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil),
@@ -346,7 +346,7 @@ var AMD64 = &kernel.SyscallTable{
291: syscalls.Supported("epoll_create1", EpollCreate1),
292: syscalls.Supported("dup3", Dup3),
293: syscalls.Supported("pipe2", Pipe2),
- 294: syscalls.Supported("inotify_init1", InotifyInit1),
+ 294: syscalls.PartiallySupported("inotify_init1", InotifyInit1, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil),
295: syscalls.Supported("preadv", Preadv),
296: syscalls.Supported("pwritev", Pwritev),
297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo),
@@ -376,7 +376,7 @@ var AMD64 = &kernel.SyscallTable{
321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
322: syscalls.Supported("execveat", Execveat),
323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
- 324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267)
+ 324: syscalls.PartiallySupported("membarrier", Membarrier, "Not supported on all platforms.", nil),
325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
// Syscalls implemented after 325 are "backports" from versions
@@ -454,9 +454,9 @@ var ARM64 = &kernel.SyscallTable{
23: syscalls.Supported("dup", Dup),
24: syscalls.Supported("dup3", Dup3),
25: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil),
- 26: syscalls.Supported("inotify_init1", InotifyInit1),
- 27: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil),
- 28: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil),
+ 26: syscalls.PartiallySupported("inotify_init1", InotifyInit1, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil),
+ 27: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil),
+ 28: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil),
29: syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil),
30: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
31: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
@@ -527,8 +527,8 @@ var ARM64 = &kernel.SyscallTable{
96: syscalls.Supported("set_tid_address", SetTidAddress),
97: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil),
98: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil),
- 99: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil),
- 100: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil),
+ 99: syscalls.Supported("set_robust_list", SetRobustList),
+ 100: syscalls.Supported("get_robust_list", GetRobustList),
101: syscalls.Supported("nanosleep", Nanosleep),
102: syscalls.Supported("getitimer", Getitimer),
103: syscalls.Supported("setitimer", Setitimer),
@@ -619,7 +619,7 @@ var ARM64 = &kernel.SyscallTable{
188: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
189: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
190: syscalls.Supported("semget", Semget),
- 191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil),
+ 191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil),
192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}),
193: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil),
194: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil),
@@ -695,7 +695,7 @@ var ARM64 = &kernel.SyscallTable{
280: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
281: syscalls.Supported("execveat", Execveat),
282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
- 283: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267)
+ 283: syscalls.PartiallySupported("membarrier", Membarrier, "Not supported on all platforms.", nil),
284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
// Syscalls after 284 are "backports" from versions of Linux after 4.4.
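A sketch of what flipping a table entry involves; the var is hypothetical and mirrors the membarrier rows above:

var membarrierEntry = syscalls.PartiallySupported(
	"membarrier",                      // syscall name
	Membarrier,                        // handler in this package
	"Not supported on all platforms.", // user-facing caveat
	nil,                               // no tracking URLs
)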
diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go
index d781d6a04..0bf313a13 100644
--- a/pkg/sentry/syscalls/linux/sys_aio.go
+++ b/pkg/sentry/syscalls/linux/sys_aio.go
@@ -15,8 +15,9 @@
package linux
import (
- "encoding/binary"
-
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -27,59 +28,6 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
-// I/O commands.
-const (
- _IOCB_CMD_PREAD = 0
- _IOCB_CMD_PWRITE = 1
- _IOCB_CMD_FSYNC = 2
- _IOCB_CMD_FDSYNC = 3
- _IOCB_CMD_NOOP = 6
- _IOCB_CMD_PREADV = 7
- _IOCB_CMD_PWRITEV = 8
-)
-
-// I/O flags.
-const (
- _IOCB_FLAG_RESFD = 1
-)
-
-// ioCallback describes an I/O request.
-//
-// The priority field is currently ignored in the implementation below. Also
-// note that the IOCB_FLAG_RESFD feature is not supported.
-type ioCallback struct {
- Data uint64
- Key uint32
- Reserved1 uint32
-
- OpCode uint16
- ReqPrio int16
- FD int32
-
- Buf uint64
- Bytes uint64
- Offset int64
-
- Reserved2 uint64
- Flags uint32
-
- // eventfd to signal if IOCB_FLAG_RESFD is set in flags.
- ResFD int32
-}
-
-// ioEvent describes an I/O result.
-//
-// +stateify savable
-type ioEvent struct {
- Data uint64
- Obj uint64
- Result int64
- Result2 int64
-}
-
-// ioEventSize is the size of an ioEvent encoded.
-var ioEventSize = binary.Size(ioEvent{})
-
// IoSetup implements linux syscall io_setup(2).
func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
nrEvents := args[0].Int()
@@ -89,7 +37,7 @@ func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
//
// The context pointer _must_ be zero initially.
var idIn uint64
- if _, err := t.CopyIn(idAddr, &idIn); err != nil {
+ if _, err := primitive.CopyUint64In(t, idAddr, &idIn); err != nil {
return 0, nil, err
}
if idIn != 0 {
@@ -102,7 +50,7 @@ func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
}
// Copy out the new ID.
- if _, err := t.CopyOut(idAddr, &id); err != nil {
+ if _, err := primitive.CopyUint64Out(t, idAddr, id); err != nil {
t.MemoryManager().DestroyAIOContext(t, id)
return 0, nil, err
}
@@ -192,10 +140,10 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
}
}
- ev := v.(*ioEvent)
+ ev := v.(*linux.IOEvent)
// Copy out the result.
- if _, err := t.CopyOut(eventsAddr, ev); err != nil {
+ if _, err := ev.CopyOut(t, eventsAddr); err != nil {
if count > 0 {
return uintptr(count), nil, nil
}
@@ -204,7 +152,7 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
}
// Keep rolling.
- eventsAddr += usermem.Addr(ioEventSize)
+ eventsAddr += usermem.Addr(linux.IOEventSize)
}
// Everything finished.
@@ -231,7 +179,7 @@ func waitForRequest(ctx *mm.AIOContext, t *kernel.Task, haveDeadline bool, deadl
}
// memoryFor returns appropriate memory for the given callback.
-func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) {
+func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) {
bytes := int(cb.Bytes)
if bytes < 0 {
// Linux also requires that this field fit in ssize_t.
@@ -242,17 +190,17 @@ func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) {
// we have no guarantee that t's AddressSpace will be active during the
// I/O.
switch cb.OpCode {
- case _IOCB_CMD_PREAD, _IOCB_CMD_PWRITE:
+ case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE:
return t.SingleIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{
AddressSpaceActive: false,
})
- case _IOCB_CMD_PREADV, _IOCB_CMD_PWRITEV:
+ case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV:
return t.IovecsIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{
AddressSpaceActive: false,
})
- case _IOCB_CMD_FSYNC, _IOCB_CMD_FDSYNC, _IOCB_CMD_NOOP:
+ case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP:
return usermem.IOSequence{}, nil
default:
@@ -261,70 +209,78 @@ func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) {
}
}
-func performCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *ioCallback, ioseq usermem.IOSequence, ctx *mm.AIOContext, eventFile *fs.File) {
- if ctx.Dead() {
- ctx.CancelPendingRequest()
- return
- }
- ev := &ioEvent{
- Data: cb.Data,
- Obj: uint64(cbAddr),
- }
+// IoCancel implements linux syscall io_cancel(2).
+//
+// It is not presently supported (ENOSYS indicates no support on this
+// architecture).
+func IoCancel(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return 0, nil, syserror.ENOSYS
+}
- // Construct a context.Context that will not be interrupted if t is
- // interrupted.
- c := t.AsyncContext()
+// LINT.IfChange
- var err error
- switch cb.OpCode {
- case _IOCB_CMD_PREAD, _IOCB_CMD_PREADV:
- ev.Result, err = file.Preadv(c, ioseq, cb.Offset)
- case _IOCB_CMD_PWRITE, _IOCB_CMD_PWRITEV:
- ev.Result, err = file.Pwritev(c, ioseq, cb.Offset)
- case _IOCB_CMD_FSYNC:
- err = file.Fsync(c, 0, fs.FileMaxOffset, fs.SyncAll)
- case _IOCB_CMD_FDSYNC:
- err = file.Fsync(c, 0, fs.FileMaxOffset, fs.SyncData)
- }
+func getAIOCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, actx *mm.AIOContext, eventFile *fs.File) kernel.AIOCallback {
+ return func(ctx context.Context) {
+ if actx.Dead() {
+ actx.CancelPendingRequest()
+ return
+ }
+ ev := &linux.IOEvent{
+ Data: cb.Data,
+ Obj: uint64(cbAddr),
+ }
- // Update the result.
- if err != nil {
- err = handleIOError(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", file)
- ev.Result = -int64(kernel.ExtractErrno(err, 0))
- }
+ var err error
+ switch cb.OpCode {
+ case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV:
+ ev.Result, err = file.Preadv(ctx, ioseq, cb.Offset)
+ case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV:
+ ev.Result, err = file.Pwritev(ctx, ioseq, cb.Offset)
+ case linux.IOCB_CMD_FSYNC:
+ err = file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll)
+ case linux.IOCB_CMD_FDSYNC:
+ err = file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncData)
+ }
- file.DecRef()
+ // Update the result.
+ if err != nil {
+ err = handleIOError(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", file)
+ ev.Result = -int64(kernel.ExtractErrno(err, 0))
+ }
- // Queue the result for delivery.
- ctx.FinishRequest(ev)
+ file.DecRef(ctx)
- // Notify the event file if one was specified. This needs to happen
- // *after* queueing the result to avoid racing with the thread we may
- // wake up.
- if eventFile != nil {
- eventFile.FileOperations.(*eventfd.EventOperations).Signal(1)
- eventFile.DecRef()
+ // Queue the result for delivery.
+ actx.FinishRequest(ev)
+
+ // Notify the event file if one was specified. This needs to happen
+ // *after* queueing the result to avoid racing with the thread we may
+ // wake up.
+ if eventFile != nil {
+ eventFile.FileOperations.(*eventfd.EventOperations).Signal(1)
+ eventFile.DecRef(ctx)
+ }
}
}
// submitCallback processes a single callback.
-func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Addr) error {
+func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr usermem.Addr) error {
file := t.GetFile(cb.FD)
if file == nil {
// File not found.
return syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Was there an eventFD? Extract it.
var eventFile *fs.File
- if cb.Flags&_IOCB_FLAG_RESFD != 0 {
+ if cb.Flags&linux.IOCB_FLAG_RESFD != 0 {
eventFile = t.GetFile(cb.ResFD)
if eventFile == nil {
// Bad FD.
return syserror.EBADF
}
- defer eventFile.DecRef()
+ defer eventFile.DecRef(t)
// Check that it is an eventfd.
if _, ok := eventFile.FileOperations.(*eventfd.EventOperations); !ok {
@@ -340,7 +296,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad
// Check offset for reads/writes.
switch cb.OpCode {
- case _IOCB_CMD_PREAD, _IOCB_CMD_PREADV, _IOCB_CMD_PWRITE, _IOCB_CMD_PWRITEV:
+ case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV:
if cb.Offset < 0 {
return syserror.EINVAL
}
@@ -366,7 +322,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad
// Perform the request asynchronously.
file.IncRef()
- fs.Async(func() { performCallback(t, file, cbAddr, cb, ioseq, ctx, eventFile) })
+ t.QueueAIO(getAIOCallback(t, file, cbAddr, cb, ioseq, ctx, eventFile))
// All set.
return nil
@@ -383,21 +339,27 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
}
for i := int32(0); i < nrEvents; i++ {
- // Copy in the address.
- cbAddrNative := t.Arch().Native(0)
- if _, err := t.CopyIn(addr, cbAddrNative); err != nil {
- if i > 0 {
- // Some successful.
- return uintptr(i), nil, nil
+ // Copy in the callback address.
+ var cbAddr usermem.Addr
+ switch t.Arch().Width() {
+ case 8:
+ var cbAddrP primitive.Uint64
+ if _, err := cbAddrP.CopyIn(t, addr); err != nil {
+ if i > 0 {
+ // Some were submitted successfully.
+ return uintptr(i), nil, nil
+ }
+ // Nothing done.
+ return 0, nil, err
}
- // Nothing done.
- return 0, nil, err
+ cbAddr = usermem.Addr(cbAddrP)
+ default:
+ return 0, nil, syserror.ENOSYS
}
// Copy in this callback.
- var cb ioCallback
- cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative))
- if _, err := t.CopyIn(cbAddr, &cb); err != nil {
+ var cb linux.IOCallback
+ if _, err := cb.CopyIn(t, cbAddr); err != nil {
if i > 0 {
// Some have been successful.
@@ -424,10 +386,4 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
return uintptr(nrEvents), nil, nil
}
-// IoCancel implements linux syscall io_cancel(2).
-//
-// It is not presently supported (ENOSYS indicates no support on this
-// architecture).
-func IoCancel(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- return 0, nil, syserror.ENOSYS
-}
+// LINT.ThenChange(vfs2/aio.go)
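
The change above replaces the synchronous performCallback helper (run via fs.Async) with getAIOCallback, which returns a kernel.AIOCallback closure queued through t.QueueAIO and later invoked with a context supplied by the executor. A minimal, self-contained sketch of that closure-capture pattern; aioCallback, getCallback, and the channel-based queue are illustrative stand-ins, not gVisor APIs:

package main

import (
	"context"
	"fmt"
)

// aioCallback mirrors the shape of kernel.AIOCallback above: work that is
// queued now and runs later with a context supplied by the executor.
type aioCallback func(ctx context.Context)

// getCallback captures per-request state in a closure, as getAIOCallback does.
func getCallback(data uint64) aioCallback {
	return func(ctx context.Context) {
		fmt.Printf("request %d finished (ctx err: %v)\n", data, ctx.Err())
	}
}

func main() {
	queue := make(chan aioCallback, 1)
	queue <- getCallback(42) // analogous to t.QueueAIO(getAIOCallback(...))
	cb := <-queue
	cb(context.Background())
}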
diff --git a/pkg/sentry/syscalls/linux/sys_capability.go b/pkg/sentry/syscalls/linux/sys_capability.go
index adf5ea5f2..d3b85e11b 100644
--- a/pkg/sentry/syscalls/linux/sys_capability.go
+++ b/pkg/sentry/syscalls/linux/sys_capability.go
@@ -45,7 +45,7 @@ func Capget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
dataAddr := args[1].Pointer()
var hdr linux.CapUserHeader
- if _, err := t.CopyIn(hdrAddr, &hdr); err != nil {
+ if _, err := hdr.CopyIn(t, hdrAddr); err != nil {
return 0, nil, err
}
// hdr.Pid doesn't need to be valid if this capget() is a "version probe"
@@ -65,7 +65,7 @@ func Capget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
Permitted: uint32(p),
Inheritable: uint32(i),
}
- _, err = t.CopyOut(dataAddr, &data)
+ _, err = data.CopyOut(t, dataAddr)
return 0, nil, err
case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3:
@@ -88,12 +88,12 @@ func Capget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
Inheritable: uint32(i >> 32),
},
}
- _, err = t.CopyOut(dataAddr, &data)
+ _, err = linux.CopyCapUserDataSliceOut(t, dataAddr, data[:])
return 0, nil, err
default:
hdr.Version = linux.HighestCapabilityVersion
- if _, err := t.CopyOut(hdrAddr, &hdr); err != nil {
+ if _, err := hdr.CopyOut(t, hdrAddr); err != nil {
return 0, nil, err
}
if dataAddr != 0 {
@@ -109,7 +109,7 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
dataAddr := args[1].Pointer()
var hdr linux.CapUserHeader
- if _, err := t.CopyIn(hdrAddr, &hdr); err != nil {
+ if _, err := hdr.CopyIn(t, hdrAddr); err != nil {
return 0, nil, err
}
switch hdr.Version {
@@ -118,7 +118,7 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return 0, nil, syserror.EPERM
}
var data linux.CapUserData
- if _, err := t.CopyIn(dataAddr, &data); err != nil {
+ if _, err := data.CopyIn(t, dataAddr); err != nil {
return 0, nil, err
}
p := auth.CapabilitySet(data.Permitted) & auth.AllCapabilities
@@ -131,7 +131,7 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return 0, nil, syserror.EPERM
}
var data [2]linux.CapUserData
- if _, err := t.CopyIn(dataAddr, &data); err != nil {
+ if _, err := linux.CopyCapUserDataSliceIn(t, dataAddr, data[:]); err != nil {
return 0, nil, err
}
p := (auth.CapabilitySet(data[0].Permitted) | (auth.CapabilitySet(data[1].Permitted) << 32)) & auth.AllCapabilities
@@ -141,7 +141,7 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
default:
hdr.Version = linux.HighestCapabilityVersion
- if _, err := t.CopyOut(hdrAddr, &hdr); err != nil {
+ if _, err := hdr.CopyOut(t, hdrAddr); err != nil {
return 0, nil, err
}
return 0, nil, syserror.EINVAL
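
A recurring change in this series swaps reflection-based t.CopyIn/t.CopyOut for per-type CopyIn/CopyOut methods generated for +marshal types, plus generated slice helpers such as CopyCapUserDataSliceOut. A sketch of what such a type's fixed-layout marshalling boils down to, assuming the linux.CapUserHeader field layout; the method names here are illustrative hand-written equivalents, not the generated API:

package main

import (
	"encoding/binary"
	"fmt"
)

// capUserHeader mirrors the layout of linux.CapUserHeader.
type capUserHeader struct {
	Version uint32
	Pid     int32
}

// marshalBytes writes the fixed, explicit-endianness layout that generated
// CopyOut methods use, with no runtime reflection.
func (h *capUserHeader) marshalBytes(dst []byte) {
	binary.LittleEndian.PutUint32(dst[0:4], h.Version)
	binary.LittleEndian.PutUint32(dst[4:8], uint32(h.Pid))
}

func (h *capUserHeader) unmarshalBytes(src []byte) {
	h.Version = binary.LittleEndian.Uint32(src[0:4])
	h.Pid = int32(binary.LittleEndian.Uint32(src[4:8]))
}

func main() {
	buf := make([]byte, 8)
	in := capUserHeader{Version: 0x20080522, Pid: 1}
	in.marshalBytes(buf)
	var out capUserHeader
	out.unmarshalBytes(buf)
	fmt.Printf("%#x %d\n", out.Version, out.Pid)
}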
diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go
index ed3413ca6..3b4f879e4 100644
--- a/pkg/sentry/syscalls/linux/sys_eventfd.go
+++ b/pkg/sentry/syscalls/linux/sys_eventfd.go
@@ -37,7 +37,7 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
event.SetFlags(fs.SettableFileFlags{
NonBlocking: flags&linux.EFD_NONBLOCK != 0,
})
- defer event.DecRef()
+ defer event.DecRef(t)
fd, err := t.NewFDFrom(0, event, kernel.FDFlags{
CloseOnExec: flags&linux.EFD_CLOEXEC != 0,
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index 35a98212a..519066a47 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -19,6 +19,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/lock"
@@ -40,7 +41,7 @@ func fileOpAt(t *kernel.Task, dirFD int32, path string, fn func(root *fs.Dirent,
// Common case: we are accessing a file in the root.
root := t.FSContext().RootDirectory()
err := fn(root, root, name, linux.MaxSymlinkTraversals)
- root.DecRef()
+ root.DecRef(t)
return err
} else if dir == "." && dirFD == linux.AT_FDCWD {
// Common case: we are accessing a file relative to the current
@@ -48,8 +49,8 @@ func fileOpAt(t *kernel.Task, dirFD int32, path string, fn func(root *fs.Dirent,
wd := t.FSContext().WorkingDirectory()
root := t.FSContext().RootDirectory()
err := fn(root, wd, name, linux.MaxSymlinkTraversals)
- wd.DecRef()
- root.DecRef()
+ wd.DecRef(t)
+ root.DecRef(t)
return err
}
@@ -83,6 +84,7 @@ func fileOpOn(t *kernel.Task, dirFD int32, path string, resolve bool, fn func(ro
}
rel = f.Dirent
if !fs.IsDir(rel.Inode.StableAttr) {
+ f.DecRef(t)
return syserror.ENOTDIR
}
}
@@ -97,19 +99,19 @@ func fileOpOn(t *kernel.Task, dirFD int32, path string, resolve bool, fn func(ro
} else {
d, err = t.MountNamespace().FindLink(t, root, rel, path, &remainingTraversals)
}
- root.DecRef()
+ root.DecRef(t)
if wd != nil {
- wd.DecRef()
+ wd.DecRef(t)
}
if f != nil {
- f.DecRef()
+ f.DecRef(t)
}
if err != nil {
return err
}
err = fn(root, d, remainingTraversals)
- d.DecRef()
+ d.DecRef(t)
return err
}
@@ -184,9 +186,9 @@ func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uint
file, err := d.Inode.GetFile(t, d, fileFlags)
if err != nil {
- return syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+ return syserror.ConvertIntr(err, syserror.ERESTARTSYS)
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Success.
newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{
@@ -242,7 +244,7 @@ func mknodAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode
if err != nil {
return err
}
- file.DecRef()
+ file.DecRef(t)
return nil
case linux.ModeNamedPipe:
@@ -332,7 +334,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l
if err != nil {
break
}
- defer found.DecRef()
+ defer found.DecRef(t)
// We found something (possibly a symlink). If the
// O_EXCL flag was passed, then we can immediately
@@ -357,7 +359,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l
resolved, err = found.Inode.Getlink(t)
if err == nil {
// No more resolution necessary.
- defer resolved.DecRef()
+ defer resolved.DecRef(t)
break
}
if err != fs.ErrResolveViaReadlink {
@@ -384,7 +386,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l
if err != nil {
break
}
- defer newParent.DecRef()
+ defer newParent.DecRef(t)
// Repeat the process with the parent and name of the
// symlink target.
@@ -414,9 +416,9 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l
// Create a new fs.File.
newFile, err = found.Inode.GetFile(t, found, fileFlags)
if err != nil {
- return syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+ return syserror.ConvertIntr(err, syserror.ERESTARTSYS)
}
- defer newFile.DecRef()
+ defer newFile.DecRef(t)
case syserror.ENOENT:
// File does not exist. Proceed with creation.
@@ -432,7 +434,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l
// No luck, bail.
return err
}
- defer newFile.DecRef()
+ defer newFile.DecRef(t)
found = newFile.Dirent
default:
return err
@@ -596,24 +598,24 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Shared flags between file and socket.
switch request {
case linux.FIONCLEX:
- t.FDTable().SetFlags(fd, kernel.FDFlags{
+ t.FDTable().SetFlags(t, fd, kernel.FDFlags{
CloseOnExec: false,
})
return 0, nil, nil
case linux.FIOCLEX:
- t.FDTable().SetFlags(fd, kernel.FDFlags{
+ t.FDTable().SetFlags(t, fd, kernel.FDFlags{
CloseOnExec: true,
})
return 0, nil, nil
case linux.FIONBIO:
var set int32
- if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+ if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
return 0, nil, err
}
flags := file.Flags()
@@ -627,7 +629,7 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
case linux.FIOASYNC:
var set int32
- if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+ if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
return 0, nil, err
}
flags := file.Flags()
@@ -641,15 +643,14 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
case linux.FIOSETOWN, linux.SIOCSPGRP:
var set int32
- if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+ if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
return 0, nil, err
}
fSetOwn(t, file, set)
return 0, nil, nil
case linux.FIOGETOWN, linux.SIOCGPGRP:
- who := fGetOwn(t, file)
- _, err := t.CopyOut(args[2].Pointer(), &who)
+ _, err := primitive.CopyInt32Out(t, args[2].Pointer(), fGetOwn(t, file))
return 0, nil, err
default:
@@ -671,9 +672,9 @@ func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
addr := args[0].Pointer()
size := args[1].SizeT()
cwd := t.FSContext().WorkingDirectory()
- defer cwd.DecRef()
+ defer cwd.DecRef(t)
root := t.FSContext().RootDirectory()
- defer root.DecRef()
+ defer root.DecRef(t)
// Get our full name from the root, prepending an "unreachable" marker if the
// root was unreachable from our current dirent; this is the same behavior as
// on Linux.
@@ -694,7 +695,7 @@ func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
}
// Top it off with a terminator.
- _, err = t.CopyOut(addr+usermem.Addr(bytes), []byte("\x00"))
+ _, err = t.CopyOutBytes(addr+usermem.Addr(bytes), []byte("\x00"))
return uintptr(bytes + 1), nil, err
}
@@ -722,7 +723,7 @@ func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return err
}
- t.FSContext().SetRootDirectory(d)
+ t.FSContext().SetRootDirectory(t, d)
return nil
})
}
@@ -747,7 +748,7 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return err
}
- t.FSContext().SetWorkingDirectory(d)
+ t.FSContext().SetWorkingDirectory(t, d)
return nil
})
}
@@ -760,7 +761,7 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Is it a directory?
if !fs.IsDir(file.Dirent.Inode.StableAttr) {
@@ -772,7 +773,7 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return 0, nil, err
}
- t.FSContext().SetWorkingDirectory(file.Dirent)
+ t.FSContext().SetWorkingDirectory(t, file.Dirent)
return 0, nil, nil
}
@@ -787,11 +788,11 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Note that Remove provides a reference on the file that we may use to
// flush. It is still active until we drop the final reference below
// (and other reference-holding operations complete).
- file, _ := t.FDTable().Remove(fd)
+ file, _ := t.FDTable().Remove(t, fd)
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
err := file.Flush(t)
return 0, nil, handleIOError(t, false /* partial */, err, syserror.EINTR, "close", file)
@@ -805,7 +806,7 @@ func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{})
if err != nil {
@@ -826,7 +827,7 @@ func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if oldFile == nil {
return 0, nil, syserror.EBADF
}
- defer oldFile.DecRef()
+ defer oldFile.DecRef(t)
return uintptr(newfd), nil, nil
}
@@ -850,7 +851,7 @@ func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if oldFile == nil {
return 0, nil, syserror.EBADF
}
- defer oldFile.DecRef()
+ defer oldFile.DecRef(t)
err := t.NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0})
if err != nil {
@@ -900,14 +901,20 @@ func fGetOwn(t *kernel.Task, file *fs.File) int32 {
//
// If who is positive, it represents a PID. If negative, it represents a PGID.
// If the PID or PGID is invalid, the owner is silently unset.
-func fSetOwn(t *kernel.Task, file *fs.File, who int32) {
+func fSetOwn(t *kernel.Task, file *fs.File, who int32) error {
a := file.Async(fasync.New).(*fasync.FileAsync)
if who < 0 {
+ // Check for overflow before flipping the sign.
+ if who-1 > who {
+ return syserror.EINVAL
+ }
pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(-who))
a.SetOwnerProcessGroup(t, pg)
+ } else {
+ tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(who))
+ a.SetOwnerThreadGroup(t, tg)
}
- tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(who))
- a.SetOwnerThreadGroup(t, tg)
+ return nil
}
// Fcntl implements linux syscall fcntl(2).
@@ -919,7 +926,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
switch cmd {
case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC:
@@ -935,10 +942,10 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return uintptr(flags.ToLinuxFDFlags()), nil, nil
case linux.F_SETFD:
flags := args[2].Uint()
- t.FDTable().SetFlags(fd, kernel.FDFlags{
+ err := t.FDTable().SetFlags(t, fd, kernel.FDFlags{
CloseOnExec: flags&linux.FD_CLOEXEC != 0,
})
- return 0, nil, nil
+ return 0, nil, err
case linux.F_GETFL:
return uintptr(file.Flags().ToLinux()), nil, nil
case linux.F_SETFL:
@@ -956,7 +963,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Copy in the lock request.
flockAddr := args[2].Pointer()
var flock linux.Flock
- if _, err := t.CopyIn(flockAddr, &flock); err != nil {
+ if _, err := flock.CopyIn(t, flockAddr); err != nil {
return 0, nil, err
}
@@ -998,9 +1005,6 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return 0, nil, err
}
- // The lock uid is that of the Task's FDTable.
- lockUniqueID := lock.UniqueID(t.FDTable().ID())
-
// These locks don't block; execute the non-blocking operation using the inode's lock
// context directly.
switch flock.Type {
@@ -1010,12 +1014,12 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
if cmd == linux.F_SETLK {
// Non-blocking lock, provide a nil lock.Blocker.
- if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) {
+ if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, nil) {
return 0, nil, syserror.EAGAIN
}
} else {
// Blocking lock, pass in the task to satisfy the lock.Blocker interface.
- if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, t) {
+ if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, t) {
return 0, nil, syserror.EINTR
}
}
@@ -1026,18 +1030,18 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
if cmd == linux.F_SETLK {
// Non-blocking lock, provide a nil lock.Blocker.
- if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) {
+ if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, nil) {
return 0, nil, syserror.EAGAIN
}
} else {
// Blocking lock, pass in the task to satisfy the lock.Blocker interface.
- if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, t) {
+ if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, t) {
return 0, nil, syserror.EINTR
}
}
return 0, nil, nil
case linux.F_UNLCK:
- file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lockUniqueID, rng)
+ file.Dirent.Inode.LockCtx.Posix.UnlockRegion(t.FDTable(), rng)
return 0, nil, nil
default:
return 0, nil, syserror.EINVAL
@@ -1045,17 +1049,16 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
case linux.F_GETOWN:
return uintptr(fGetOwn(t, file)), nil, nil
case linux.F_SETOWN:
- fSetOwn(t, file, args[2].Int())
- return 0, nil, nil
+ return 0, nil, fSetOwn(t, file, args[2].Int())
case linux.F_GETOWN_EX:
addr := args[2].Pointer()
owner := fGetOwnEx(t, file)
- _, err := t.CopyOut(addr, &owner)
+ _, err := owner.CopyOut(t, addr)
return 0, nil, err
case linux.F_SETOWN_EX:
addr := args[2].Pointer()
var owner linux.FOwnerEx
- n, err := t.CopyIn(addr, &owner)
+ _, err := owner.CopyIn(t, addr)
if err != nil {
return 0, nil, err
}
@@ -1067,21 +1070,21 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return 0, nil, syserror.ESRCH
}
a.SetOwnerTask(t, task)
- return uintptr(n), nil, nil
+ return 0, nil, nil
case linux.F_OWNER_PID:
tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(owner.PID))
if tg == nil {
return 0, nil, syserror.ESRCH
}
a.SetOwnerThreadGroup(t, tg)
- return uintptr(n), nil, nil
+ return 0, nil, nil
case linux.F_OWNER_PGRP:
pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(owner.PID))
if pg == nil {
return 0, nil, syserror.ESRCH
}
a.SetOwnerProcessGroup(t, pg)
- return uintptr(n), nil, nil
+ return 0, nil, nil
default:
return 0, nil, syserror.EINVAL
}
@@ -1114,17 +1117,6 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
}
-// LINT.ThenChange(vfs2/fd.go)
-
-const (
- _FADV_NORMAL = 0
- _FADV_RANDOM = 1
- _FADV_SEQUENTIAL = 2
- _FADV_WILLNEED = 3
- _FADV_DONTNEED = 4
- _FADV_NOREUSE = 5
-)
-
// Fadvise64 implements linux syscall fadvise64(2).
// This implementation currently ignores the provided advice.
func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
@@ -1141,7 +1133,7 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// If the FD refers to a pipe or FIFO, return error.
if fs.IsPipe(file.Dirent.Inode.StableAttr) {
@@ -1149,12 +1141,12 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
}
switch advice {
- case _FADV_NORMAL:
- case _FADV_RANDOM:
- case _FADV_SEQUENTIAL:
- case _FADV_WILLNEED:
- case _FADV_DONTNEED:
- case _FADV_NOREUSE:
+ case linux.POSIX_FADV_NORMAL:
+ case linux.POSIX_FADV_RANDOM:
+ case linux.POSIX_FADV_SEQUENTIAL:
+ case linux.POSIX_FADV_WILLNEED:
+ case linux.POSIX_FADV_DONTNEED:
+ case linux.POSIX_FADV_NOREUSE:
default:
return 0, nil, syserror.EINVAL
}
@@ -1163,6 +1155,8 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, nil
}
+// LINT.ThenChange(vfs2/fd.go)
+
// LINT.IfChange
func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error {
@@ -1182,7 +1176,7 @@ func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode
switch err {
case nil:
// The directory existed.
- defer f.DecRef()
+ defer f.DecRef(t)
return syserror.EEXIST
case syserror.EACCES:
// Permission denied while walking to the directory.
@@ -1360,7 +1354,7 @@ func linkAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32
if target == nil {
return syserror.EBADF
}
- defer target.DecRef()
+ defer target.DecRef(t)
if err := mayLinkAt(t, target.Dirent.Inode); err != nil {
return err
}
@@ -1613,7 +1607,7 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Reject truncation if the file flags do not permit this operation.
// This is different from truncate(2) above.
@@ -1741,7 +1735,7 @@ func chownAt(t *kernel.Task, fd int32, addr usermem.Addr, resolve, allowEmpty bo
if file == nil {
return syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
return chown(t, file.Dirent, uid, gid)
}
@@ -1779,7 +1773,7 @@ func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
return 0, nil, chown(t, file.Dirent, uid, gid)
}
@@ -1844,7 +1838,7 @@ func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
return 0, nil, chmod(t, file.Dirent, mode)
}
@@ -1904,10 +1898,10 @@ func utimes(t *kernel.Task, dirFD int32, addr usermem.Addr, ts fs.TimeSpec, reso
if f == nil {
return syserror.EBADF
}
- defer f.DecRef()
+ defer f.DecRef(t)
root := t.FSContext().RootDirectory()
- defer root.DecRef()
+ defer root.DecRef(t)
return setTimestamp(root, f.Dirent, linux.MaxSymlinkTraversals)
}
@@ -1929,7 +1923,7 @@ func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
ts := defaultSetToSystemTimeSpec()
if timesAddr != 0 {
var times linux.Utime
- if _, err := t.CopyIn(timesAddr, &times); err != nil {
+ if _, err := times.CopyIn(t, timesAddr); err != nil {
return 0, nil, err
}
ts = fs.TimeSpec{
@@ -1949,7 +1943,7 @@ func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
ts := defaultSetToSystemTimeSpec()
if timesAddr != 0 {
var times [2]linux.Timeval
- if _, err := t.CopyIn(timesAddr, &times); err != nil {
+ if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil {
return 0, nil, err
}
ts = fs.TimeSpec{
@@ -1977,7 +1971,7 @@ func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
ts := defaultSetToSystemTimeSpec()
if timesAddr != 0 {
var times [2]linux.Timespec
- if _, err := t.CopyIn(timesAddr, &times); err != nil {
+ if _, err := linux.CopyTimespecSliceIn(t, timesAddr, times[:]); err != nil {
return 0, nil, err
}
if !timespecIsValid(times[0]) || !timespecIsValid(times[1]) {
@@ -2011,7 +2005,7 @@ func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
ts := defaultSetToSystemTimeSpec()
if timesAddr != 0 {
var times [2]linux.Timeval
- if _, err := t.CopyIn(timesAddr, &times); err != nil {
+ if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil {
return 0, nil, err
}
if times[0].Usec >= 1e6 || times[0].Usec < 0 ||
@@ -2099,7 +2093,7 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
if offset < 0 || length <= 0 {
return 0, nil, syserror.EINVAL
@@ -2152,27 +2146,11 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// flock(2): EBADF fd is not an open file descriptor.
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
nonblocking := operation&linux.LOCK_NB != 0
operation &^= linux.LOCK_NB
- // flock(2):
- // Locks created by flock() are associated with an open file table entry. This means that
- // duplicate file descriptors (created by, for example, fork(2) or dup(2)) refer to the
- // same lock, and this lock may be modified or released using any of these descriptors. Furthermore,
- // the lock is released either by an explicit LOCK_UN operation on any of these duplicate
- // descriptors, or when all such descriptors have been closed.
- //
- // If a process uses open(2) (or similar) to obtain more than one descriptor for the same file,
- // these descriptors are treated independently by flock(). An attempt to lock the file using
- // one of these file descriptors may be denied by a lock that the calling process has already placed via
- // another descriptor.
- //
- // We use the File UniqueID as the lock UniqueID because it needs to reference the same lock across dup(2)
- // and fork(2).
- lockUniqueID := lock.UniqueID(file.UniqueID)
-
// A BSD style lock spans the entire file.
rng := lock.LockRange{
Start: 0,
@@ -2183,29 +2161,29 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
case linux.LOCK_EX:
if nonblocking {
// Since we're nonblocking we pass a nil lock.Blocker implementation.
- if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) {
+ if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, nil) {
return 0, nil, syserror.EWOULDBLOCK
}
} else {
// Because we're blocking we will pass the task to satisfy the lock.Blocker interface.
- if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.WriteLock, rng, t) {
+ if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, t) {
return 0, nil, syserror.EINTR
}
}
case linux.LOCK_SH:
if nonblocking {
// Since we're nonblocking we pass a nil lock.Blocker implementation.
- if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) {
+ if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, nil) {
return 0, nil, syserror.EWOULDBLOCK
}
} else {
// Because we're blocking we will pass the task to satisfy the lock.Blocker interface.
- if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.ReadLock, rng, t) {
+ if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, t) {
return 0, nil, syserror.EINTR
}
}
case linux.LOCK_UN:
- file.Dirent.Inode.LockCtx.BSD.UnlockRegion(lockUniqueID, rng)
+ file.Dirent.Inode.LockCtx.BSD.UnlockRegion(file, rng)
default:
// flock(2): EINVAL operation is invalid.
return 0, nil, syserror.EINVAL
@@ -2251,8 +2229,8 @@ func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
return 0, nil, err
}
- defer dirent.DecRef()
- defer file.DecRef()
+ defer dirent.DecRef(t)
+ defer file.DecRef(t)
newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{
CloseOnExec: cloExec,
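
The new overflow check in fSetOwn above relies on defined int32 wraparound: who-1 > who holds only for math.MinInt32, the one value whose negation is unrepresentable, so the sign can be flipped safely everywhere else. A self-contained illustration of the trick:

package main

import (
	"fmt"
	"math"
)

// negatable reports whether -who is representable in int32, using the same
// expression as fSetOwn: only math.MinInt32 satisfies who-1 > who, because
// the subtraction wraps around to math.MaxInt32.
func negatable(who int32) bool {
	return who-1 <= who
}

func main() {
	fmt.Println(negatable(-5))            // true: -(-5) is fine
	fmt.Println(negatable(math.MinInt32)) // false: negation would overflow
}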
diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go
index b68261f72..f39ce0639 100644
--- a/pkg/sentry/syscalls/linux/sys_futex.go
+++ b/pkg/sentry/syscalls/linux/sys_futex.go
@@ -73,8 +73,8 @@ func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, fo
err = t.BlockWithDeadline(w.C, true, ktime.FromTimespec(ts))
}
- t.Futex().WaitComplete(w)
- return 0, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+ t.Futex().WaitComplete(w, t)
+ return 0, syserror.ConvertIntr(err, syserror.ERESTARTSYS)
}
// futexWaitDuration performs a FUTEX_WAIT, blocking until the wait is
@@ -95,7 +95,7 @@ func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, add
}
remaining, err := t.BlockWithTimeout(w.C, !forever, duration)
- t.Futex().WaitComplete(w)
+ t.Futex().WaitComplete(w, t)
if err == nil {
return 0, nil
}
@@ -110,7 +110,7 @@ func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, add
// The wait duration was absolute, restart with the original arguments.
if forever {
- return 0, kernel.ERESTARTSYS
+ return 0, syserror.ERESTARTSYS
}
// The wait duration was relative, restart with the remaining duration.
@@ -121,7 +121,7 @@ func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, add
val: val,
mask: mask,
})
- return 0, kernel.ERESTART_RESTARTBLOCK
+ return 0, syserror.ERESTART_RESTARTBLOCK
}
func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr usermem.Addr, private bool) error {
@@ -148,8 +148,8 @@ func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr usermem.A
timer.Destroy()
}
- t.Futex().WaitComplete(w)
- return syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+ t.Futex().WaitComplete(w, t)
+ return syserror.ConvertIntr(err, syserror.ERESTARTSYS)
}
func tryLockPI(t *kernel.Task, addr usermem.Addr, private bool) error {
@@ -198,7 +198,7 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
switch cmd {
case linux.FUTEX_WAIT:
// WAIT uses a relative timeout.
- mask = ^uint32(0)
+ mask = linux.FUTEX_BITSET_MATCH_ANY
var timeoutDur time.Duration
if !forever {
timeoutDur = time.Duration(timespec.ToNsecCapped()) * time.Nanosecond
@@ -286,3 +286,53 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return 0, nil, syserror.ENOSYS
}
}
+
+// SetRobustList implements linux syscall set_robust_list(2).
+func SetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ head := args[0].Pointer()
+ length := args[1].SizeT()
+
+ if length != uint(linux.SizeOfRobustListHead) {
+ return 0, nil, syserror.EINVAL
+ }
+ t.SetRobustList(head)
+ return 0, nil, nil
+}
+
+// GetRobustList implements linux syscall get_robust_list(2).
+func GetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ // Despite the syscall using the name 'pid' for this variable, it is
+ // very much a tid.
+ tid := args[0].Int()
+ headAddr := args[1].Pointer()
+ sizeAddr := args[2].Pointer()
+
+ if tid < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ ot := t
+ if tid != 0 {
+ if ot = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)); ot == nil {
+ return 0, nil, syserror.ESRCH
+ }
+ }
+
+ // Copy out head pointer.
+ head := t.Arch().Native(uintptr(ot.GetRobustList()))
+ if _, err := head.CopyOut(t, headAddr); err != nil {
+ return 0, nil, err
+ }
+
+ // Copy out size, which is a constant. Note that while size isn't
+ // an address, it is defined as the arch-dependent size_t, so it
+ // needs to be converted to a native-sized int.
+ size := t.Arch().Native(uintptr(linux.SizeOfRobustListHead))
+ if _, err := size.CopyOut(t, sizeAddr); err != nil {
+ return 0, nil, err
+ }
+
+ return 0, nil, nil
+}
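
The futexWaitDuration hunk above shows the two restart paths for an interrupted wait: an absolute ("forever") wait restarts with its original arguments via ERESTARTSYS, while a relative wait must save the remaining duration and return ERESTART_RESTARTBLOCK. A generic sketch of that decision, assuming stand-in error values and a save callback in place of t.SetSyscallRestartBlock:

package main

import (
	"errors"
	"fmt"
	"time"
)

var (
	errRestartSys   = errors.New("ERESTARTSYS")           // restart with original args
	errRestartBlock = errors.New("ERESTART_RESTARTBLOCK") // restart via saved block
)

// restartError mirrors the choice made in futexWaitDuration: an interrupted
// absolute wait restarts as-is; a relative wait carries the remaining
// duration forward in a restart block.
func restartError(forever bool, remaining time.Duration, save func(time.Duration)) error {
	if forever {
		return errRestartSys
	}
	save(remaining) // analogous to t.SetSyscallRestartBlock(...)
	return errRestartBlock
}

func main() {
	err := restartError(false, 150*time.Millisecond, func(d time.Duration) {
		fmt.Println("saved remaining:", d)
	})
	fmt.Println(err)
}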
diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go
index b126fecc0..b25f7d881 100644
--- a/pkg/sentry/syscalls/linux/sys_getdents.go
+++ b/pkg/sentry/syscalls/linux/sys_getdents.go
@@ -19,7 +19,6 @@ import (
"io"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -68,7 +67,7 @@ func getdents(t *kernel.Task, fd int32, addr usermem.Addr, size int, f func(*dir
if dir == nil {
return 0, syserror.EBADF
}
- defer dir.DecRef()
+ defer dir.DecRef(t)
w := &usermem.IOReadWriter{
Ctx: t,
@@ -82,7 +81,7 @@ func getdents(t *kernel.Task, fd int32, addr usermem.Addr, size int, f func(*dir
ds := newDirentSerializer(f, w, t.Arch(), size)
rerr := dir.Readdir(t, ds)
- switch err := handleIOError(t, ds.Written() > 0, rerr, kernel.ERESTARTSYS, "getdents", dir); err {
+ switch err := handleIOError(t, ds.Written() > 0, rerr, syserror.ERESTARTSYS, "getdents", dir); err {
case nil:
dir.Dirent.InotifyEvent(linux.IN_ACCESS, 0)
return uintptr(ds.Written()), nil
@@ -93,19 +92,23 @@ func getdents(t *kernel.Task, fd int32, addr usermem.Addr, size int, f func(*dir
}
}
-// oldDirentHdr is a fixed sized header matching the fixed size
-// fields found in the old linux dirent struct.
+// oldDirentHdr is a fixed sized header matching the fixed size fields found in
+// the old linux dirent struct.
+//
+// +marshal
type oldDirentHdr struct {
Ino uint64
Off uint64
- Reclen uint16
+ Reclen uint16 `marshal:"unaligned"` // Struct ends mid-word.
}
-// direntHdr is a fixed sized header matching the fixed size
-// fields found in the new linux dirent struct.
+// direntHdr is a fixed sized header matching the fixed size fields found in the
+// new linux dirent struct.
+//
+// +marshal
type direntHdr struct {
OldHdr oldDirentHdr
- Typ uint8
+ Typ uint8 `marshal:"unaligned"` // Struct ends mid-word.
}
// dirent contains the data pointed to by a new linux dirent struct.
@@ -134,20 +137,20 @@ func newDirent(width uint, name string, attr fs.DentAttr, offset uint64) *dirent
// the old linux dirent format.
func smallestDirent(a arch.Context) uint {
d := dirent{}
- return uint(binary.Size(d.Hdr.OldHdr)) + a.Width() + 1
+ return uint(d.Hdr.OldHdr.SizeBytes()) + a.Width() + 1
}
// smallestDirent64 returns the size of the smallest possible dirent using
// the new linux dirent format.
func smallestDirent64(a arch.Context) uint {
d := dirent{}
- return uint(binary.Size(d.Hdr)) + a.Width()
+ return uint(d.Hdr.SizeBytes()) + a.Width()
}
// padRec pads the name field until the rec length is a multiple of the width,
// which must be a power of 2. It returns the padded rec length.
func (d *dirent) padRec(width int) uint16 {
- a := int(binary.Size(d.Hdr)) + len(d.Name)
+ a := d.Hdr.SizeBytes() + len(d.Name)
r := (a + width) &^ (width - 1)
padding := r - a
d.Name = append(d.Name, make([]byte, padding)...)
@@ -157,7 +160,7 @@ func (d *dirent) padRec(width int) uint16 {
// Serialize64 serializes a Dirent struct to a byte slice, keeping the new
// linux dirent format. Returns the number of bytes serialized or an error.
func (d *dirent) Serialize64(w io.Writer) (int, error) {
- n1, err := w.Write(binary.Marshal(nil, usermem.ByteOrder, d.Hdr))
+ n1, err := d.Hdr.WriteTo(w)
if err != nil {
return 0, err
}
@@ -165,14 +168,14 @@ func (d *dirent) Serialize64(w io.Writer) (int, error) {
if err != nil {
return 0, err
}
- return n1 + n2, nil
+ return int(n1) + n2, nil
}
// Serialize serializes a Dirent struct to a byte slice, using the old linux
// dirent format.
// Returns the number of bytes serialized or an error.
func (d *dirent) Serialize(w io.Writer) (int, error) {
- n1, err := w.Write(binary.Marshal(nil, usermem.ByteOrder, d.Hdr.OldHdr))
+ n1, err := d.Hdr.OldHdr.WriteTo(w)
if err != nil {
return 0, err
}
@@ -184,7 +187,7 @@ func (d *dirent) Serialize(w io.Writer) (int, error) {
if err != nil {
return 0, err
}
- return n1 + n2 + n3, nil
+ return int(n1) + n2 + n3, nil
}
// direntSerializer implements fs.InodeOperationsInfoSerializer, serializing dirents to an
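
The padRec context above uses the rounding expression r := (a + width) &^ (width - 1). Because it adds a full width before masking (rather than width-1, the usual round-up idiom), it always yields at least one padding byte, leaving room for the trailing NUL after the name. A small worked example of the arithmetic:

package main

import "fmt"

// padTo rounds a record length up with the same expression padRec uses:
// (a + width) &^ (width - 1). width must be a power of two.
func padTo(a, width int) int {
	return (a + width) &^ (width - 1)
}

func main() {
	fmt.Println(padTo(19, 8)) // 24: five bytes of padding
	fmt.Println(padTo(24, 8)) // 32: already aligned, still pads a full word
}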
diff --git a/pkg/sentry/syscalls/linux/sys_identity.go b/pkg/sentry/syscalls/linux/sys_identity.go
index 715ac45e6..a29d307e5 100644
--- a/pkg/sentry/syscalls/linux/sys_identity.go
+++ b/pkg/sentry/syscalls/linux/sys_identity.go
@@ -49,13 +49,13 @@ func Getresuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
ruid := c.RealKUID.In(c.UserNamespace).OrOverflow()
euid := c.EffectiveKUID.In(c.UserNamespace).OrOverflow()
suid := c.SavedKUID.In(c.UserNamespace).OrOverflow()
- if _, err := t.CopyOut(ruidAddr, ruid); err != nil {
+ if _, err := ruid.CopyOut(t, ruidAddr); err != nil {
return 0, nil, err
}
- if _, err := t.CopyOut(euidAddr, euid); err != nil {
+ if _, err := euid.CopyOut(t, euidAddr); err != nil {
return 0, nil, err
}
- if _, err := t.CopyOut(suidAddr, suid); err != nil {
+ if _, err := suid.CopyOut(t, suidAddr); err != nil {
return 0, nil, err
}
return 0, nil, nil
@@ -84,13 +84,13 @@ func Getresgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
rgid := c.RealKGID.In(c.UserNamespace).OrOverflow()
egid := c.EffectiveKGID.In(c.UserNamespace).OrOverflow()
sgid := c.SavedKGID.In(c.UserNamespace).OrOverflow()
- if _, err := t.CopyOut(rgidAddr, rgid); err != nil {
+ if _, err := rgid.CopyOut(t, rgidAddr); err != nil {
return 0, nil, err
}
- if _, err := t.CopyOut(egidAddr, egid); err != nil {
+ if _, err := egid.CopyOut(t, egidAddr); err != nil {
return 0, nil, err
}
- if _, err := t.CopyOut(sgidAddr, sgid); err != nil {
+ if _, err := sgid.CopyOut(t, sgidAddr); err != nil {
return 0, nil, err
}
return 0, nil, nil
@@ -157,7 +157,7 @@ func Getgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
for i, kgid := range kgids {
gids[i] = kgid.In(t.UserNamespace()).OrOverflow()
}
- if _, err := t.CopyOut(args[1].Pointer(), gids); err != nil {
+ if _, err := auth.CopyGIDSliceOut(t, args[1].Pointer(), gids); err != nil {
return 0, nil, err
}
return uintptr(len(gids)), nil, nil
@@ -173,7 +173,7 @@ func Setgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, t.SetExtraGIDs(nil)
}
gids := make([]auth.GID, size)
- if _, err := t.CopyIn(args[1].Pointer(), &gids); err != nil {
+ if _, err := auth.CopyGIDSliceIn(t, args[1].Pointer(), gids); err != nil {
return 0, nil, err
}
return 0, nil, t.SetExtraGIDs(gids)

diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go
index b2c7b3444..cf47bb9dd 100644
--- a/pkg/sentry/syscalls/linux/sys_inotify.go
+++ b/pkg/sentry/syscalls/linux/sys_inotify.go
@@ -40,7 +40,7 @@ func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
NonBlocking: flags&linux.IN_NONBLOCK != 0,
}
n := fs.NewFile(t, dirent, fileFlags, fs.NewInotify(t))
- defer n.DecRef()
+ defer n.DecRef(t)
fd, err := t.NewFDFrom(0, n, kernel.FDFlags{
CloseOnExec: flags&linux.IN_CLOEXEC != 0,
@@ -71,7 +71,7 @@ func fdToInotify(t *kernel.Task, fd int32) (*fs.Inotify, *fs.File, error) {
ino, ok := file.FileOperations.(*fs.Inotify)
if !ok {
// Not an inotify fd.
- file.DecRef()
+ file.DecRef(t)
return nil, nil, syserror.EINVAL
}
@@ -98,7 +98,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern
if err != nil {
return 0, nil, err
}
- defer file.DecRef()
+ defer file.DecRef(t)
path, _, err := copyInPath(t, addr, false /* allowEmpty */)
if err != nil {
@@ -128,6 +128,6 @@ func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
if err != nil {
return 0, nil, err
}
- defer file.DecRef()
- return 0, nil, ino.RmWatch(wd)
+ defer file.DecRef(t)
+ return 0, nil, ino.RmWatch(t, wd)
}
diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go
index 3f7691eae..0046347cb 100644
--- a/pkg/sentry/syscalls/linux/sys_lseek.go
+++ b/pkg/sentry/syscalls/linux/sys_lseek.go
@@ -33,7 +33,7 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
var sw fs.SeekWhence
switch whence {
@@ -48,7 +48,7 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
offset, serr := file.Seek(t, sw, offset)
- err := handleIOError(t, false /* partialResult */, serr, kernel.ERESTARTSYS, "lseek", file)
+ err := handleIOError(t, false /* partialResult */, serr, syserror.ERESTARTSYS, "lseek", file)
if err != nil {
return 0, nil, err
}
diff --git a/pkg/sentry/syscalls/linux/sys_membarrier.go b/pkg/sentry/syscalls/linux/sys_membarrier.go
new file mode 100644
index 000000000..63ee5d435
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_membarrier.go
@@ -0,0 +1,103 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Membarrier implements syscall membarrier(2).
+func Membarrier(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ cmd := args[0].Int()
+ flags := args[1].Uint()
+
+ switch cmd {
+ case linux.MEMBARRIER_CMD_QUERY:
+ if flags != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ var supportedCommands uintptr
+ if t.Kernel().Platform.HaveGlobalMemoryBarrier() {
+ supportedCommands |= linux.MEMBARRIER_CMD_GLOBAL |
+ linux.MEMBARRIER_CMD_GLOBAL_EXPEDITED |
+ linux.MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED |
+ linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED |
+ linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED
+ }
+ if t.RSeqAvailable() {
+ supportedCommands |= linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ |
+ linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ
+ }
+ return supportedCommands, nil, nil
+ case linux.MEMBARRIER_CMD_GLOBAL, linux.MEMBARRIER_CMD_GLOBAL_EXPEDITED, linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED:
+ if flags != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ if !t.Kernel().Platform.HaveGlobalMemoryBarrier() {
+ return 0, nil, syserror.EINVAL
+ }
+ if cmd == linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED && !t.MemoryManager().IsMembarrierPrivateEnabled() {
+ return 0, nil, syserror.EPERM
+ }
+ return 0, nil, t.Kernel().Platform.GlobalMemoryBarrier()
+ case linux.MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
+ if flags != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ if !t.Kernel().Platform.HaveGlobalMemoryBarrier() {
+ return 0, nil, syserror.EINVAL
+ }
+ // Registration is a no-op.
+ return 0, nil, nil
+ case linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
+ if flags != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ if !t.Kernel().Platform.HaveGlobalMemoryBarrier() {
+ return 0, nil, syserror.EINVAL
+ }
+ t.MemoryManager().EnableMembarrierPrivate()
+ return 0, nil, nil
+ case linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+ if flags&^linux.MEMBARRIER_CMD_FLAG_CPU != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ if !t.RSeqAvailable() {
+ return 0, nil, syserror.EINVAL
+ }
+ if !t.MemoryManager().IsMembarrierRSeqEnabled() {
+ return 0, nil, syserror.EPERM
+ }
+ // MEMBARRIER_CMD_FLAG_CPU and cpu_id are ignored since we don't have
+ // the ability to preempt specific CPUs.
+ return 0, nil, t.Kernel().Platform.PreemptAllCPUs()
+ case linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
+ if flags != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ if !t.RSeqAvailable() {
+ return 0, nil, syserror.EINVAL
+ }
+ t.MemoryManager().EnableMembarrierRSeq()
+ return 0, nil, nil
+ default:
+ // Probably a command we don't implement.
+ t.Kernel().EmitUnimplementedEvent(t)
+ return 0, nil, syserror.EINVAL
+ }
+}
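
MEMBARRIER_CMD_QUERY above returns a bitmask of the commands the sentry will accept, built up from platform and rseq capabilities. A sketch of how a caller interprets that mask; the bit values follow the Linux UAPI layout as I recall it (check linux/membarrier.h before relying on them):

package main

import "fmt"

// Illustrative command bits; values assumed from the membarrier(2) UAPI.
const (
	cmdGlobal           = 1 << 0
	cmdGlobalExpedited  = 1 << 1
	cmdPrivateExpedited = 1 << 3
)

// supported interprets a MEMBARRIER_CMD_QUERY result: the return value is a
// bitmask of commands the kernel (or sentry) will accept.
func supported(mask, cmd uintptr) bool {
	return mask&cmd != 0
}

func main() {
	mask := uintptr(cmdGlobal | cmdGlobalExpedited | cmdPrivateExpedited)
	fmt.Println(supported(mask, cmdPrivateExpedited)) // true
}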
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go
index 91694d374..cd8dfdfa4 100644
--- a/pkg/sentry/syscalls/linux/sys_mmap.go
+++ b/pkg/sentry/syscalls/linux/sys_mmap.go
@@ -75,7 +75,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
}
defer func() {
if opts.MappingIdentity != nil {
- opts.MappingIdentity.DecRef()
+ opts.MappingIdentity.DecRef(t)
}
}()
@@ -85,7 +85,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
flags := file.Flags()
// mmap unconditionally requires that the FD is readable.
@@ -100,6 +100,15 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if err := file.ConfigureMMap(t, &opts); err != nil {
return 0, nil, err
}
+ } else if shared {
+ // Back shared anonymous mappings with a special mappable.
+ opts.Offset = 0
+ m, err := mm.NewSharedAnonMappable(opts.Length, t.Kernel())
+ if err != nil {
+ return 0, nil, err
+ }
+ opts.MappingIdentity = m // transfers ownership of m to opts
+ opts.Mappable = m
}
rv, err := t.MemoryManager().MMap(t, opts)
@@ -239,7 +248,7 @@ func Mincore(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
return 0, nil, syserror.ENOMEM
}
resident := bytes.Repeat([]byte{1}, int(mapped/usermem.PageSize))
- _, err := t.CopyOut(vec, resident)
+ _, err := t.CopyOutBytes(vec, resident)
return 0, nil, err
}
@@ -267,7 +276,7 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
})
// MSync calls fsync; the same interrupt conversion rules apply (see
// mm/msync.c, fsync POSIX.1-2008).
- return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+ return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS)
}
// Mlock implements linux syscall mlock(2).
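
The Mmap hunk above adds a third case to the mapping setup: file-backed mappings let the file configure the options, shared anonymous mappings now get a dedicated mappable (so pages are shared across fork), and private anonymous mappings need none. A condensed, runnable restatement of that dispatch; mapKind is purely illustrative:

package main

import "fmt"

// mapKind classifies a request the way Mmap dispatches it.
func mapKind(anon, shared bool) string {
	switch {
	case !anon:
		return "file-backed: file.ConfigureMMap fills in opts"
	case shared:
		return "shared anon: NewSharedAnonMappable backs the mapping"
	default:
		return "private anon: demand-zero, no mappable needed"
	}
}

func main() {
	fmt.Println(mapKind(true, true))
}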
diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go
index eb5ff48f5..bd0633564 100644
--- a/pkg/sentry/syscalls/linux/sys_mount.go
+++ b/pkg/sentry/syscalls/linux/sys_mount.go
@@ -115,7 +115,7 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}); err != nil {
// Something went wrong. Drop our ref on rootInode before
// returning the error.
- rootInode.DecRef()
+ rootInode.DecRef(t)
return 0, nil, err
}
diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go
index 43c510930..849a47476 100644
--- a/pkg/sentry/syscalls/linux/sys_pipe.go
+++ b/pkg/sentry/syscalls/linux/sys_pipe.go
@@ -16,6 +16,7 @@ package linux
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -34,10 +35,10 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) {
r, w := pipe.NewConnectedPipe(t, pipe.DefaultPipeSize, usermem.PageSize)
r.SetFlags(linuxToFlags(flags).Settable())
- defer r.DecRef()
+ defer r.DecRef(t)
w.SetFlags(linuxToFlags(flags).Settable())
- defer w.DecRef()
+ defer w.DecRef(t)
fds, err := t.NewFDs(0, []*fs.File{r, w}, kernel.FDFlags{
CloseOnExec: flags&linux.O_CLOEXEC != 0,
@@ -46,10 +47,10 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) {
return 0, err
}
- if _, err := t.CopyOut(addr, fds); err != nil {
+ if _, err := primitive.CopyInt32SliceOut(t, addr, fds); err != nil {
for _, fd := range fds {
- if file, _ := t.FDTable().Remove(fd); file != nil {
- file.DecRef()
+ if file, _ := t.FDTable().Remove(t, fd); file != nil {
+ file.DecRef(t)
}
}
return 0, err
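
pipe2 above installs both pipe ends first and only then copies the FD numbers out to userspace; if the copy-out fails, it removes each installed FD and drops the reference Remove hands back. A stand-in sketch of that rollback pattern (copyOut, remove, and decRef are hypothetical callbacks, not gVisor APIs):

package main

import (
	"errors"
	"fmt"
)

// copyOutOrUnwind attempts the copy-out and, on failure, unwinds every
// installed FD before propagating the error.
func copyOutOrUnwind(fds []int32, copyOut func([]int32) error,
	remove func(int32) bool, decRef func(int32)) error {
	err := copyOut(fds)
	if err == nil {
		return nil
	}
	for _, fd := range fds {
		if remove(fd) { // analogous to t.FDTable().Remove(t, fd)
			decRef(fd) // analogous to file.DecRef(t)
		}
	}
	return err
}

func main() {
	err := copyOutOrUnwind([]int32{3, 4},
		func([]int32) error { return errors.New("copy-out failed") },
		func(fd int32) bool { fmt.Println("removed", fd); return true },
		func(fd int32) { fmt.Println("decref", fd) },
	)
	fmt.Println(err)
}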
diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go
index f0198141c..254f4c9f9 100644
--- a/pkg/sentry/syscalls/linux/sys_poll.go
+++ b/pkg/sentry/syscalls/linux/sys_poll.go
@@ -70,7 +70,7 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan
}
if ch == nil {
- defer file.DecRef()
+ defer file.DecRef(t)
} else {
state.file = file
state.waiter, _ = waiter.NewChannelEntry(ch)
@@ -82,11 +82,11 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan
}
// releaseState releases all the pollState in "state".
-func releaseState(state []pollState) {
+func releaseState(t *kernel.Task, state []pollState) {
for i := range state {
if state[i].file != nil {
state[i].file.EventUnregister(&state[i].waiter)
- state[i].file.DecRef()
+ state[i].file.DecRef(t)
}
}
}
@@ -107,7 +107,7 @@ func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time.
// result, we stop registering for events but still go through all files
// to get their ready masks.
state := make([]pollState, len(pfd))
- defer releaseState(state)
+ defer releaseState(t, state)
n := uintptr(0)
for i := range pfd {
initReadiness(t, &pfd[i], &state[i], ch)
@@ -162,7 +162,7 @@ func CopyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD
pfd := make([]linux.PollFD, nfds)
if nfds > 0 {
- if _, err := t.CopyIn(addr, &pfd); err != nil {
+ if _, err := linux.CopyPollFDSliceIn(t, addr, pfd); err != nil {
return nil, err
}
}
@@ -189,7 +189,7 @@ func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration)
// The poll entries are copied out regardless of whether
// any are set or not. This aligns with the Linux behavior.
if nfds > 0 && err == nil {
- if _, err := t.CopyOut(addr, pfd); err != nil {
+ if _, err := linux.CopyPollFDSliceOut(t, addr, pfd); err != nil {
return remainingTimeout, 0, err
}
}
@@ -202,7 +202,7 @@ func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialBy
set := make([]byte, nBytes)
if addr != 0 {
- if _, err := t.CopyIn(addr, &set); err != nil {
+ if _, err := t.CopyInBytes(addr, set); err != nil {
return nil, err
}
// If we only use part of the last byte, mask out the extraneous bits.
@@ -266,7 +266,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add
if file == nil {
return 0, syserror.EBADF
}
- file.DecRef()
+ file.DecRef(t)
var mask int16
if (rV & m) != 0 {
@@ -329,19 +329,19 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add
// Copy updated vectors back.
if readFDs != 0 {
- if _, err := t.CopyOut(readFDs, r); err != nil {
+ if _, err := t.CopyOutBytes(readFDs, r); err != nil {
return 0, err
}
}
if writeFDs != 0 {
- if _, err := t.CopyOut(writeFDs, w); err != nil {
+ if _, err := t.CopyOutBytes(writeFDs, w); err != nil {
return 0, err
}
}
if exceptFDs != 0 {
- if _, err := t.CopyOut(exceptFDs, e); err != nil {
+ if _, err := t.CopyOutBytes(exceptFDs, e); err != nil {
return 0, err
}
}
@@ -410,7 +410,7 @@ func poll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration
nfds: nfds,
timeout: remainingTimeout,
})
- return 0, kernel.ERESTART_RESTARTBLOCK
+ return 0, syserror.ERESTART_RESTARTBLOCK
}
return n, err
}
@@ -464,7 +464,7 @@ func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Note that this means that if err is nil but copyErr is not, copyErr is
// ignored. This is consistent with Linux.
if err == syserror.EINTR && copyErr == nil {
- err = kernel.ERESTARTNOHAND
+ err = syserror.ERESTARTNOHAND
}
return n, nil, err
}
@@ -494,7 +494,7 @@ func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr)
// See comment in Ppoll.
if err == syserror.EINTR && copyErr == nil {
- err = kernel.ERESTARTNOHAND
+ err = syserror.ERESTARTNOHAND
}
return n, nil, err
}
@@ -539,7 +539,7 @@ func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)
// See comment in Ppoll.
if err == syserror.EINTR && copyErr == nil {
- err = kernel.ERESTARTNOHAND
+ err = syserror.ERESTARTNOHAND
}
return n, nil, err
}
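
CopyInFDSet above takes nBytes and nBitsInLastPartialByte and masks out the extraneous bits of a partial last byte after the copy-in. A small sketch of the fd_set geometry involved; the (nfds+7)/8 split is my assumption about how the caller derives these arguments:

package main

import "fmt"

// fdSetBytes computes how many bytes cover nfds bits and how many bits of
// the last byte are in use; extraneous bits in a partial last byte must be
// masked off after copy-in.
func fdSetBytes(nfds int) (nBytes, nBitsInLastPartialByte int) {
	return (nfds + 7) / 8, nfds % 8
}

func main() {
	nBytes, partial := fdSetBytes(10) // 10 fds -> 2 bytes, 2 bits of byte 1
	fmt.Println(nBytes, partial)
	if partial != 0 {
		mask := byte(1)<<uint(partial) - 1
		fmt.Printf("last-byte mask: %08b\n", mask) // 00000011
	}
}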
diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go
index f92bf8096..a892d2c62 100644
--- a/pkg/sentry/syscalls/linux/sys_prctl.go
+++ b/pkg/sentry/syscalls/linux/sys_prctl.go
@@ -18,6 +18,7 @@ import (
"fmt"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fsbridge"
@@ -43,7 +44,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return 0, nil, nil
case linux.PR_GET_PDEATHSIG:
- _, err := t.CopyOut(args[1].Pointer(), int32(t.ParentDeathSignal()))
+ _, err := primitive.CopyInt32Out(t, args[1].Pointer(), int32(t.ParentDeathSignal()))
return 0, nil, err
case linux.PR_GET_DUMPABLE:
@@ -110,7 +111,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
buf[len] = 0
len++
}
- _, err := t.CopyOut(addr, buf[:len])
+ _, err := t.CopyOutBytes(addr, buf[:len])
if err != nil {
return 0, nil, err
}
@@ -128,7 +129,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Are they trying to set exe to a non-file?
if !fs.IsFile(file.Dirent.Inode.StableAttr) {
@@ -136,7 +137,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
// Set the underlying executable.
- t.MemoryManager().SetExecutable(fsbridge.NewFSFile(file))
+ t.MemoryManager().SetExecutable(t, fsbridge.NewFSFile(file))
case linux.PR_SET_MM_AUXV,
linux.PR_SET_MM_START_CODE,
diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go
index 071b4bacc..f655d3db1 100644
--- a/pkg/sentry/syscalls/linux/sys_read.go
+++ b/pkg/sentry/syscalls/linux/sys_read.go
@@ -48,7 +48,7 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the file is readable.
if !file.Flags().Read {
@@ -71,7 +71,7 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
n, err := readv(t, file, dst)
t.IOUsage().AccountReadSyscall(n)
- return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "read", file)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "read", file)
}
// Readahead implements readahead(2).
@@ -84,7 +84,7 @@ func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the file is readable.
if !file.Flags().Read {
@@ -118,7 +118,7 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the offset is legitimate and does not overflow.
if offset < 0 || offset+int64(size) < 0 {
@@ -151,7 +151,7 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
n, err := preadv(t, file, dst, offset)
t.IOUsage().AccountReadSyscall(n)
- return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pread64", file)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pread64", file)
}
// Readv implements linux syscall readv(2).
@@ -164,7 +164,7 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the file is readable.
if !file.Flags().Read {
@@ -181,7 +181,7 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
n, err := readv(t, file, dst)
t.IOUsage().AccountReadSyscall(n)
- return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "readv", file)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "readv", file)
}
// Preadv implements linux syscall preadv(2).
@@ -195,7 +195,7 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the offset is legitimate.
if offset < 0 {
@@ -222,7 +222,7 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
n, err := preadv(t, file, dst, offset)
t.IOUsage().AccountReadSyscall(n)
- return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "preadv", file)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "preadv", file)
}
// Preadv2 implements linux syscall preadv2(2).
@@ -244,7 +244,7 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the offset is legitimate.
if offset < -1 {
@@ -280,12 +280,12 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if offset == -1 {
n, err := readv(t, file, dst)
t.IOUsage().AccountReadSyscall(n)
- return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "preadv2", file)
}
n, err := preadv(t, file, dst, offset)
t.IOUsage().AccountReadSyscall(n)
- return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "preadv2", file)
}
func readv(t *kernel.Task, f *fs.File, dst usermem.IOSequence) (int64, error) {
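
Nearly every handler above changes file.DecRef() to file.DecRef(t). The motivation, sketched with stand-in types below, is that dropping the last reference can trigger release work that needs a context, so the context is now threaded through every DecRef call site:

package main

import (
	"context"
	"fmt"
	"sync/atomic"
)

// file is a stand-in for a reference-counted sentry object.
type file struct {
	refs int64
}

func newFile() *file { return &file{refs: 1} }

// DecRef drops one reference; destroying the object on the last reference
// may need the caller's context, which is why it is now a parameter.
func (f *file) DecRef(ctx context.Context) {
	if atomic.AddInt64(&f.refs, -1) == 0 {
		f.destroy(ctx)
	}
}

func (f *file) destroy(ctx context.Context) {
	// Release work that can depend on the context goes here.
	fmt.Println("destroyed; context still live:", ctx.Err() == nil)
}

func main() {
	f := newFile()
	defer f.DecRef(context.Background()) // mirrors `defer file.DecRef(t)`
	fmt.Println("using file")
}
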
diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go
index d5d5b6959..309c183a3 100644
--- a/pkg/sentry/syscalls/linux/sys_rlimit.go
+++ b/pkg/sentry/syscalls/linux/sys_rlimit.go
@@ -16,6 +16,7 @@ package linux
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/limits"
@@ -26,17 +27,13 @@ import (
// rlimit describes an implementation of 'struct rlimit', which may vary from
// system-to-system.
type rlimit interface {
+ marshal.Marshallable
+
// toLimit converts an rlimit to a limits.Limit.
toLimit() *limits.Limit
// fromLimit converts a limits.Limit to an rlimit.
fromLimit(lim limits.Limit)
-
- // copyIn copies an rlimit from the untrusted app to the kernel.
- copyIn(t *kernel.Task, addr usermem.Addr) error
-
- // copyOut copies an rlimit from the kernel to the untrusted app.
- copyOut(t *kernel.Task, addr usermem.Addr) error
}
// newRlimit returns the appropriate rlimit type for 'struct rlimit' on this system.
@@ -50,6 +47,7 @@ func newRlimit(t *kernel.Task) (rlimit, error) {
}
}
+// +marshal
type rlimit64 struct {
Cur uint64
Max uint64
@@ -70,12 +68,12 @@ func (r *rlimit64) fromLimit(lim limits.Limit) {
}
func (r *rlimit64) copyIn(t *kernel.Task, addr usermem.Addr) error {
- _, err := t.CopyIn(addr, r)
+ _, err := r.CopyIn(t, addr)
return err
}
func (r *rlimit64) copyOut(t *kernel.Task, addr usermem.Addr) error {
- _, err := t.CopyOut(addr, *r)
+ _, err := r.CopyOut(t, addr)
return err
}
@@ -140,7 +138,8 @@ func Getrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, err
}
rlim.fromLimit(lim)
- return 0, nil, rlim.copyOut(t, addr)
+ _, err = rlim.CopyOut(t, addr)
+ return 0, nil, err
}
// Setrlimit implements linux syscall setrlimit(2).
@@ -155,7 +154,7 @@ func Setrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if err != nil {
return 0, nil, err
}
- if err := rlim.copyIn(t, addr); err != nil {
+ if _, err := rlim.CopyIn(t, addr); err != nil {
return 0, nil, syserror.EFAULT
}
_, err = prlimit64(t, resource, rlim.toLimit())
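
The rlimit interface above now embeds marshal.Marshallable, and the `// +marshal` directive makes go_marshal emit the serialization methods at build time. A rough, hand-written model of what gets generated for rlimit64 (simplified; the real generated code also includes CopyIn/CopyOut wrappers over task memory and an unsafe fast path):

package main

import (
	"encoding/binary"
	"fmt"
)

type rlimit64 struct {
	Cur uint64
	Max uint64
}

// SizeBytes reports the fixed wire size of the struct.
func (r *rlimit64) SizeBytes() int { return 16 }

// MarshalBytes encodes the struct field by field.
func (r *rlimit64) MarshalBytes(dst []byte) {
	binary.LittleEndian.PutUint64(dst[0:8], r.Cur)
	binary.LittleEndian.PutUint64(dst[8:16], r.Max)
}

// UnmarshalBytes is the inverse decoding step.
func (r *rlimit64) UnmarshalBytes(src []byte) {
	r.Cur = binary.LittleEndian.Uint64(src[0:8])
	r.Max = binary.LittleEndian.Uint64(src[8:16])
}

func main() {
	buf := make([]byte, 16)
	in := rlimit64{Cur: 1024, Max: 4096}
	in.MarshalBytes(buf)
	var out rlimit64
	out.UnmarshalBytes(buf)
	fmt.Printf("%+v\n", out)
}
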
diff --git a/pkg/sentry/syscalls/linux/sys_rusage.go b/pkg/sentry/syscalls/linux/sys_rusage.go
index 1674c7445..ac5c98a54 100644
--- a/pkg/sentry/syscalls/linux/sys_rusage.go
+++ b/pkg/sentry/syscalls/linux/sys_rusage.go
@@ -80,7 +80,7 @@ func Getrusage(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
}
ru := getrusage(t, which)
- _, err := t.CopyOut(addr, &ru)
+ _, err := ru.CopyOut(t, addr)
return 0, nil, err
}
@@ -104,7 +104,7 @@ func Times(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
CUTime: linux.ClockTFromDuration(cs2.UserTime),
CSTime: linux.ClockTFromDuration(cs2.SysTime),
}
- if _, err := t.CopyOut(addr, &r); err != nil {
+ if _, err := r.CopyOut(t, addr); err != nil {
return 0, nil, err
}
diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go
index 99f6993f5..bfcf44b6f 100644
--- a/pkg/sentry/syscalls/linux/sys_sched.go
+++ b/pkg/sentry/syscalls/linux/sys_sched.go
@@ -27,8 +27,10 @@ const (
)
// SchedParam replicates struct sched_param in sched.h.
+//
+// +marshal
type SchedParam struct {
- schedPriority int64
+ schedPriority int32
}
// SchedGetparam implements linux syscall sched_getparam(2).
@@ -45,7 +47,7 @@ func SchedGetparam(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
return 0, nil, syserror.ESRCH
}
r := SchedParam{schedPriority: onlyPriority}
- if _, err := t.CopyOut(param, r); err != nil {
+ if _, err := r.CopyOut(t, param); err != nil {
return 0, nil, err
}
@@ -79,7 +81,7 @@ func SchedSetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ke
return 0, nil, syserror.ESRCH
}
var r SchedParam
- if _, err := t.CopyIn(param, &r); err != nil {
+ if _, err := r.CopyIn(t, param); err != nil {
return 0, nil, syserror.EINVAL
}
if r.schedPriority != onlyPriority {
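
The one-line type change in SchedParam is an ABI fix: Linux declares struct sched_param with a plain int, which is 4 bytes on amd64, so marshalling an int64 field would move the wrong number of bytes to and from the app. A quick stand-in check:

package main

import (
	"fmt"
	"unsafe"
)

// schedParam mirrors Linux's struct sched_param { int sched_priority; }.
type schedParam struct {
	schedPriority int32
}

func main() {
	// 4 bytes, matching the kernel ABI; an int64 field would make it 8.
	fmt.Println(unsafe.Sizeof(schedParam{}))
}
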
diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go
index 5b7a66f4d..4fdb4463c 100644
--- a/pkg/sentry/syscalls/linux/sys_seccomp.go
+++ b/pkg/sentry/syscalls/linux/sys_seccomp.go
@@ -24,6 +24,8 @@ import (
)
// userSockFprog is equivalent to Linux's struct sock_fprog on amd64.
+//
+// +marshal
type userSockFprog struct {
// Len is the length of the filter in BPF instructions.
Len uint16
@@ -33,7 +35,7 @@ type userSockFprog struct {
// Filter is a user pointer to the struct sock_filter array that makes up
// the filter program. Filter is a uint64 rather than a usermem.Addr
// because usermem.Addr is actually uintptr, which is not a fixed-size
- // type, and encoding/binary.Read objects to this.
+ // type.
Filter uint64
}
@@ -54,11 +56,11 @@ func seccomp(t *kernel.Task, mode, flags uint64, addr usermem.Addr) error {
}
var fprog userSockFprog
- if _, err := t.CopyIn(addr, &fprog); err != nil {
+ if _, err := fprog.CopyIn(t, addr); err != nil {
return err
}
filter := make([]linux.BPFInstruction, int(fprog.Len))
- if _, err := t.CopyIn(usermem.Addr(fprog.Filter), &filter); err != nil {
+ if _, err := linux.CopyBPFInstructionSliceIn(t, usermem.Addr(fprog.Filter), filter); err != nil {
return err
}
compiledFilter, err := bpf.Compile(filter)
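
linux.CopyBPFInstructionSliceIn above is also generated code: marking a fixed-size type with the slice variant of +marshal produces bulk helpers that copy a whole contiguous array in one operation and decode each element. A stand-in model, assuming Linux's 8-byte struct sock_filter layout and little-endian order:

package main

import (
	"encoding/binary"
	"fmt"
)

// bpfInstruction mirrors Linux's 8-byte struct sock_filter.
type bpfInstruction struct {
	Code uint16
	JT   uint8
	JF   uint8
	K    uint32
}

const instrSize = 8

// copyInstructionSliceIn models the generated bulk copy: validate the source
// once, then decode len(dst) fixed-size records.
func copyInstructionSliceIn(src []byte, dst []bpfInstruction) error {
	if len(src) < len(dst)*instrSize {
		return fmt.Errorf("EFAULT")
	}
	for i := range dst {
		b := src[i*instrSize : (i+1)*instrSize]
		dst[i].Code = binary.LittleEndian.Uint16(b[0:2])
		dst[i].JT = b[2]
		dst[i].JF = b[3]
		dst[i].K = binary.LittleEndian.Uint32(b[4:8])
	}
	return nil
}

func main() {
	raw := []byte{0x06, 0, 0, 0, 0, 0, 0, 0x7f} // illustrative instruction
	out := make([]bpfInstruction, 1)
	fmt.Println(copyInstructionSliceIn(raw, out), out[0])
}
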
diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go
index 5f54f2456..c2d4bf805 100644
--- a/pkg/sentry/syscalls/linux/sys_sem.go
+++ b/pkg/sentry/syscalls/linux/sys_sem.go
@@ -18,6 +18,7 @@ import (
"math"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -66,7 +67,7 @@ func Semop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
ops := make([]linux.Sembuf, nsops)
- if _, err := t.CopyIn(sembufAddr, ops); err != nil {
+ if _, err := linux.CopySembufSliceIn(t, sembufAddr, ops); err != nil {
return 0, nil, err
}
@@ -116,8 +117,8 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
case linux.IPC_SET:
arg := args[3].Pointer()
- s := linux.SemidDS{}
- if _, err := t.CopyIn(arg, &s); err != nil {
+ var s linux.SemidDS
+ if _, err := s.CopyIn(t, arg); err != nil {
return 0, nil, err
}
@@ -128,9 +129,17 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
v, err := getPID(t, id, num)
return uintptr(v), nil, err
+ case linux.IPC_STAT:
+ arg := args[3].Pointer()
+ ds, err := ipcStat(t, id)
+ if err == nil {
+ _, err = ds.CopyOut(t, arg)
+ }
+
+ return 0, nil, err
+
case linux.IPC_INFO,
linux.SEM_INFO,
- linux.IPC_STAT,
linux.SEM_STAT,
linux.SEM_STAT_ANY,
linux.GETNCNT,
@@ -170,6 +179,16 @@ func ipcSet(t *kernel.Task, id int32, uid auth.UID, gid auth.GID, perms fs.FileP
return set.Change(t, creds, owner, perms)
}
+func ipcStat(t *kernel.Task, id int32) (*linux.SemidDS, error) {
+ r := t.IPCNamespace().SemaphoreRegistry()
+ set := r.FindByID(id)
+ if set == nil {
+ return nil, syserror.EINVAL
+ }
+ creds := auth.CredentialsFromContext(t)
+ return set.GetStat(creds)
+}
+
func setVal(t *kernel.Task, id int32, num int32, val int16) error {
r := t.IPCNamespace().SemaphoreRegistry()
set := r.FindByID(id)
@@ -188,7 +207,7 @@ func setValAll(t *kernel.Task, id int32, array usermem.Addr) error {
return syserror.EINVAL
}
vals := make([]uint16, set.Size())
- if _, err := t.CopyIn(array, vals); err != nil {
+ if _, err := primitive.CopyUint16SliceIn(t, array, vals); err != nil {
return err
}
creds := auth.CredentialsFromContext(t)
@@ -217,7 +236,7 @@ func getValAll(t *kernel.Task, id int32, array usermem.Addr) error {
if err != nil {
return err
}
- _, err = t.CopyOut(array, vals)
+ _, err = primitive.CopyUint16SliceOut(t, array, vals)
return err
}
diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go
index 4a8bc24a2..584064143 100644
--- a/pkg/sentry/syscalls/linux/sys_shm.go
+++ b/pkg/sentry/syscalls/linux/sys_shm.go
@@ -39,7 +39,7 @@ func Shmget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if err != nil {
return 0, nil, err
}
- defer segment.DecRef()
+ defer segment.DecRef(t)
return uintptr(segment.ID), nil, nil
}
@@ -66,7 +66,7 @@ func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if err != nil {
return 0, nil, syserror.EINVAL
}
- defer segment.DecRef()
+ defer segment.DecRef(t)
opts, err := segment.ConfigureAttach(t, addr, shm.AttachOpts{
Execute: flag&linux.SHM_EXEC == linux.SHM_EXEC,
@@ -108,22 +108,22 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if err != nil {
return 0, nil, syserror.EINVAL
}
- defer segment.DecRef()
+ defer segment.DecRef(t)
stat, err := segment.IPCStat(t)
if err == nil {
- _, err = t.CopyOut(buf, stat)
+ _, err = stat.CopyOut(t, buf)
}
return 0, nil, err
case linux.IPC_INFO:
params := r.IPCInfo()
- _, err := t.CopyOut(buf, params)
+ _, err := params.CopyOut(t, buf)
return 0, nil, err
case linux.SHM_INFO:
info := r.ShmInfo()
- _, err := t.CopyOut(buf, info)
+ _, err := info.CopyOut(t, buf)
return 0, nil, err
}
@@ -132,20 +132,19 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if err != nil {
return 0, nil, syserror.EINVAL
}
- defer segment.DecRef()
+ defer segment.DecRef(t)
switch cmd {
case linux.IPC_SET:
var ds linux.ShmidDS
- _, err = t.CopyIn(buf, &ds)
- if err != nil {
+ if _, err = ds.CopyIn(t, buf); err != nil {
return 0, nil, err
}
 		err = segment.Set(t, &ds)
return 0, nil, err
case linux.IPC_RMID:
- segment.MarkDestroyed()
+ segment.MarkDestroyed(t)
return 0, nil, nil
case linux.SHM_LOCK, linux.SHM_UNLOCK:
diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go
index d2b0012ae..e748d33d8 100644
--- a/pkg/sentry/syscalls/linux/sys_signal.go
+++ b/pkg/sentry/syscalls/linux/sys_signal.go
@@ -348,7 +348,7 @@ func Sigaltstack(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
// Pause implements linux syscall pause(2).
func Pause(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- return 0, nil, syserror.ConvertIntr(t.Block(nil), kernel.ERESTARTNOHAND)
+ return 0, nil, syserror.ConvertIntr(t.Block(nil), syserror.ERESTARTNOHAND)
}
// RtSigpending implements linux syscall rt_sigpending(2).
@@ -496,7 +496,7 @@ func RtSigsuspend(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
t.SetSavedSignalMask(oldmask)
// Perform the wait.
- return 0, nil, syserror.ConvertIntr(t.Block(nil), kernel.ERESTARTNOHAND)
+ return 0, nil, syserror.ConvertIntr(t.Block(nil), syserror.ERESTARTNOHAND)
}
// RestartSyscall implements the linux syscall restart_syscall(2).
@@ -536,7 +536,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize ui
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Is this a signalfd?
if s, ok := file.FileOperations.(*signalfd.SignalOperations); ok {
@@ -553,7 +553,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize ui
if err != nil {
return 0, nil, err
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Set appropriate flags.
file.SetFlags(fs.SettableFileFlags{
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index 0760af77b..9cd052c3d 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -19,6 +19,8 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -65,10 +67,10 @@ const flagsOffset = 48
const sizeOfInt32 = 4
// messageHeader64Len is the length of a MessageHeader64 struct.
-var messageHeader64Len = uint64(binary.Size(MessageHeader64{}))
+var messageHeader64Len = uint64((*MessageHeader64)(nil).SizeBytes())
// multipleMessageHeader64Len is the length of a multipleMessageHeader64 struct.
-var multipleMessageHeader64Len = uint64(binary.Size(multipleMessageHeader64{}))
+var multipleMessageHeader64Len = uint64((*multipleMessageHeader64)(nil).SizeBytes())
// baseRecvFlags are the flags that are accepted across recvmsg(2),
// recvmmsg(2), and recvfrom(2).
@@ -76,6 +78,8 @@ const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT |
// MessageHeader64 is the 64-bit representation of the msghdr struct used in
// the recvmsg and sendmsg syscalls.
+//
+// +marshal
type MessageHeader64 struct {
// Name is the optional pointer to a network address buffer.
Name uint64
@@ -104,30 +108,14 @@ type MessageHeader64 struct {
// multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in
// the recvmmsg and sendmmsg syscalls.
+//
+// +marshal
type multipleMessageHeader64 struct {
msgHdr MessageHeader64
msgLen uint32
_ int32
}
-// CopyInMessageHeader64 copies a message header from user to kernel memory.
-func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader64) error {
- b := t.CopyScratchBuffer(52)
- if _, err := t.CopyInBytes(addr, b); err != nil {
- return err
- }
-
- msg.Name = usermem.ByteOrder.Uint64(b[0:])
- msg.NameLen = usermem.ByteOrder.Uint32(b[8:])
- msg.Iov = usermem.ByteOrder.Uint64(b[16:])
- msg.IovLen = usermem.ByteOrder.Uint64(b[24:])
- msg.Control = usermem.ByteOrder.Uint64(b[32:])
- msg.ControlLen = usermem.ByteOrder.Uint64(b[40:])
- msg.Flags = int32(usermem.ByteOrder.Uint32(b[48:]))
-
- return nil
-}
-
// CaptureAddress allocates memory for and copies a socket address structure
// from the untrusted address space range.
func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) {
@@ -146,10 +134,10 @@ func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte,
// writeAddress writes a sockaddr structure and its length to an output buffer
// in the untrusted address space range. If the address is bigger than the
// buffer, it is truncated.
-func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error {
+func writeAddress(t *kernel.Task, addr linux.SockAddr, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error {
// Get the buffer length.
var bufLen uint32
- if _, err := t.CopyIn(addrLenPtr, &bufLen); err != nil {
+ if _, err := primitive.CopyUint32In(t, addrLenPtr, &bufLen); err != nil {
return err
}
@@ -158,7 +146,7 @@ func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr user
}
// Write the length unconditionally.
- if _, err := t.CopyOut(addrLenPtr, addrLen); err != nil {
+ if _, err := primitive.CopyUint32Out(t, addrLenPtr, addrLen); err != nil {
return err
}
@@ -171,7 +159,8 @@ func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr user
}
// Copy as much of the address as will fit in the buffer.
- encodedAddr := binary.Marshal(nil, usermem.ByteOrder, addr)
+ encodedAddr := t.CopyScratchBuffer(addr.SizeBytes())
+ addr.MarshalUnsafe(encodedAddr)
if bufLen > uint32(len(encodedAddr)) {
bufLen = uint32(len(encodedAddr))
}
@@ -198,7 +187,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
s.SetFlags(fs.SettableFileFlags{
NonBlocking: stype&linux.SOCK_NONBLOCK != 0,
})
- defer s.DecRef()
+ defer s.DecRef(t)
fd, err := t.NewFDFrom(0, s, kernel.FDFlags{
CloseOnExec: stype&linux.SOCK_CLOEXEC != 0,
@@ -233,8 +222,8 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
}
s1.SetFlags(fileFlags)
s2.SetFlags(fileFlags)
- defer s1.DecRef()
- defer s2.DecRef()
+ defer s1.DecRef(t)
+ defer s2.DecRef(t)
// Create the FDs for the sockets.
fds, err := t.NewFDs(0, []*fs.File{s1, s2}, kernel.FDFlags{
@@ -245,10 +234,10 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
}
// Copy the file descriptors out.
- if _, err := t.CopyOut(socks, fds); err != nil {
+ if _, err := primitive.CopyInt32SliceOut(t, socks, fds); err != nil {
for _, fd := range fds {
- if file, _ := t.FDTable().Remove(fd); file != nil {
- file.DecRef()
+ if file, _ := t.FDTable().Remove(t, fd); file != nil {
+ file.DecRef(t)
}
}
return 0, nil, err
@@ -268,7 +257,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -283,7 +272,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
}
blocking := !file.Flags().NonBlocking
- return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), kernel.ERESTARTSYS)
+ return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), syserror.ERESTARTSYS)
}
// accept is the implementation of the accept syscall. It is called by accept
@@ -299,7 +288,7 @@ func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, f
if file == nil {
return 0, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -314,7 +303,7 @@ func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, f
peerRequested := addrLen != 0
nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking)
if e != nil {
- return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+ return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS)
}
if peerRequested {
// NOTE(magi): Linux does not give you an error if it can't
@@ -358,7 +347,7 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -385,7 +374,7 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -414,7 +403,7 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -445,7 +434,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -454,8 +443,8 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
}
// Read the length. Reject negative values.
- optLen := int32(0)
- if _, err := t.CopyIn(optLenAddr, &optLen); err != nil {
+ var optLen int32
+ if _, err := primitive.CopyInt32In(t, optLenAddr, &optLen); err != nil {
return 0, nil, err
}
if optLen < 0 {
@@ -469,12 +458,12 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
}
vLen := int32(binary.Size(v))
- if _, err := t.CopyOut(optLenAddr, vLen); err != nil {
+ if _, err := primitive.CopyInt32Out(t, optLenAddr, vLen); err != nil {
return 0, nil, err
}
if v != nil {
- if _, err := t.CopyOut(optValAddr, v); err != nil {
+ if _, err := v.CopyOut(t, optValAddr); err != nil {
return 0, nil, err
}
}
@@ -484,7 +473,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
// getSockOpt tries to handle common socket options, or dispatches to a specific
// socket implementation.
-func getSockOpt(t *kernel.Task, s socket.Socket, level, name int, optValAddr usermem.Addr, len int) (interface{}, *syserr.Error) {
+func getSockOpt(t *kernel.Task, s socket.Socket, level, name int, optValAddr usermem.Addr, len int) (marshal.Marshallable, *syserr.Error) {
if level == linux.SOL_SOCKET {
switch name {
case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL:
@@ -496,13 +485,16 @@ func getSockOpt(t *kernel.Task, s socket.Socket, level, name int, optValAddr use
switch name {
case linux.SO_TYPE:
_, skType, _ := s.Type()
- return int32(skType), nil
+ v := primitive.Int32(skType)
+ return &v, nil
case linux.SO_DOMAIN:
family, _, _ := s.Type()
- return int32(family), nil
+ v := primitive.Int32(family)
+ return &v, nil
case linux.SO_PROTOCOL:
_, _, protocol := s.Type()
- return int32(protocol), nil
+ v := primitive.Int32(protocol)
+ return &v, nil
}
}
@@ -524,7 +516,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -539,7 +531,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
return 0, nil, syserror.EINVAL
}
buf := t.CopyScratchBuffer(int(optLen))
- if _, err := t.CopyIn(optValAddr, &buf); err != nil {
+ if _, err := t.CopyInBytes(optValAddr, buf); err != nil {
return 0, nil, err
}
@@ -562,7 +554,7 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -590,7 +582,7 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -623,7 +615,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -676,7 +668,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -728,7 +720,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if !ok {
return 0, nil, syserror.EFAULT
}
- if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+ if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil {
break
}
count++
@@ -743,7 +735,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) {
// Capture the message header and io vectors.
var msg MessageHeader64
- if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+ if _, err := msg.CopyIn(t, msgPtr); err != nil {
return 0, err
}
@@ -766,16 +758,16 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i
if msg.ControlLen == 0 && msg.NameLen == 0 {
n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0)
if err != nil {
- return 0, syserror.ConvertIntr(err.ToError(), kernel.ERESTARTSYS)
+ return 0, syserror.ConvertIntr(err.ToError(), syserror.ERESTARTSYS)
}
if !cms.Unix.Empty() {
mflags |= linux.MSG_CTRUNC
- cms.Release()
+ cms.Release(t)
}
if int(msg.Flags) != mflags {
// Copy out the flags to the caller.
- if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+ if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil {
return 0, err
}
}
@@ -788,9 +780,9 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i
}
n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen)
if e != nil {
- return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+ return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS)
}
- defer cms.Release()
+ defer cms.Release(t)
controlData := make([]byte, 0, msg.ControlLen)
controlData = control.PackControlMessages(t, cms, controlData)
@@ -812,17 +804,17 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i
}
// Copy the control data to the caller.
- if _, err := t.CopyOut(msgPtr+controlLenOffset, uint64(len(controlData))); err != nil {
+ if _, err := primitive.CopyUint64Out(t, msgPtr+controlLenOffset, uint64(len(controlData))); err != nil {
return 0, err
}
if len(controlData) > 0 {
- if _, err := t.CopyOut(usermem.Addr(msg.Control), controlData); err != nil {
+ if _, err := t.CopyOutBytes(usermem.Addr(msg.Control), controlData); err != nil {
return 0, err
}
}
// Copy out the flags to the caller.
- if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+ if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil {
return 0, err
}
@@ -846,7 +838,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag
if file == nil {
return 0, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -875,9 +867,9 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag
}
n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0)
- cm.Release()
+ cm.Release(t)
if e != nil {
- return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+ return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS)
}
// Copy the address to the caller.
@@ -919,7 +911,7 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -957,7 +949,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -991,7 +983,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if !ok {
return 0, nil, syserror.EFAULT
}
- if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+ if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil {
break
}
count++
@@ -1006,7 +998,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr usermem.Addr, flags int32) (uintptr, error) {
// Capture the message header.
var msg MessageHeader64
- if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+ if _, err := msg.CopyIn(t, msgPtr); err != nil {
return 0, err
}
@@ -1017,7 +1009,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme
return 0, syserror.ENOBUFS
}
controlData = make([]byte, msg.ControlLen)
- if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil {
+ if _, err := t.CopyInBytes(usermem.Addr(msg.Control), controlData); err != nil {
return 0, err
}
}
@@ -1059,9 +1051,11 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme
// Call the syscall implementation.
n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages)
- err = handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file)
- if err != nil {
- controlMessages.Release()
+ err = handleIOError(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendmsg", file)
+ // Control messages should be released on error as well as for zero-length
+ // messages, which are discarded by the receiver.
+ if n == 0 || err != nil {
+ controlMessages.Release(t)
}
return uintptr(n), err
}
@@ -1079,7 +1073,7 @@ func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags
if file == nil {
return 0, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -1119,7 +1113,7 @@ func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags
// Call the syscall implementation.
n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: control.New(t, s, nil)})
- return uintptr(n), handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendto", file)
+ return uintptr(n), handleIOError(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendto", file)
}
// SendTo implements the linux syscall sendto(2).
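
Two details in the sys_socket.go hunks are worth unpacking. First, `uint64((*MessageHeader64)(nil).SizeBytes())` is safe because the generated SizeBytes depends only on the type and never dereferences its receiver, which makes it a cheap replacement for binary.Size's reflection. Second, the hand-rolled CopyInMessageHeader64 with its hard-coded offsets becomes an ordinary msg.CopyIn call. A stand-in illustration of the nil-receiver trick:

package main

import "fmt"

// messageHeader64 mirrors the 52-byte wire layout used by recvmsg/sendmsg
// (offsets 0, 8, 16, 24, 32, 40, 48, as in the deleted hand-written copier).
type messageHeader64 struct {
	Name       uint64
	NameLen    uint32
	_          uint32
	Iov        uint64
	IovLen     uint64
	Control    uint64
	ControlLen uint64
	Flags      int32
}

// SizeBytes is written the way go_marshal generates it: computed from the
// type alone, so the receiver may be nil.
func (*messageHeader64) SizeBytes() int {
	return 8 + 4 + 4 + 8 + 8 + 8 + 8 + 4
}

func main() {
	fmt.Println(uint64((*messageHeader64)(nil).SizeBytes())) // 52
}
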
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index 77c78889d..46616c961 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -16,6 +16,7 @@ package linux
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -101,7 +102,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if inFile == nil {
return 0, nil, syserror.EBADF
}
- defer inFile.DecRef()
+ defer inFile.DecRef(t)
if !inFile.Flags().Read {
return 0, nil, syserror.EBADF
@@ -111,7 +112,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if outFile == nil {
return 0, nil, syserror.EBADF
}
- defer outFile.DecRef()
+ defer outFile.DecRef(t)
if !outFile.Flags().Write {
return 0, nil, syserror.EBADF
@@ -141,7 +142,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
// Copy in the offset.
var offset int64
- if _, err := t.CopyIn(offsetAddr, &offset); err != nil {
+ if _, err := primitive.CopyInt64In(t, offsetAddr, &offset); err != nil {
return 0, nil, err
}
@@ -149,11 +150,11 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
n, err = doSplice(t, outFile, inFile, fs.SpliceOpts{
Length: count,
SrcOffset: true,
 			SrcStart:  offset,
}, outFile.Flags().NonBlocking)
// Copy out the new offset.
- if _, err := t.CopyOut(offsetAddr, n+offset); err != nil {
+ if _, err := primitive.CopyInt64Out(t, offsetAddr, offset+n); err != nil {
return 0, nil, err
}
} else {
@@ -170,7 +171,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
// We can only pass a single file to handleIOError, so pick inFile
// arbitrarily. This is used only for debugging purposes.
- return uintptr(n), nil, handleIOError(t, false, err, kernel.ERESTARTSYS, "sendfile", inFile)
+ return uintptr(n), nil, handleIOError(t, false, err, syserror.ERESTARTSYS, "sendfile", inFile)
}
// Splice implements splice(2).
@@ -192,13 +193,13 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if outFile == nil {
return 0, nil, syserror.EBADF
}
- defer outFile.DecRef()
+ defer outFile.DecRef(t)
inFile := t.GetFile(inFD)
if inFile == nil {
return 0, nil, syserror.EBADF
}
- defer inFile.DecRef()
+ defer inFile.DecRef(t)
// The operation is non-blocking if anything is non-blocking.
//
@@ -228,7 +229,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
}
var offset int64
- if _, err := t.CopyIn(outOffset, &offset); err != nil {
+ if _, err := primitive.CopyInt64In(t, outOffset, &offset); err != nil {
return 0, nil, err
}
@@ -246,7 +247,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
}
var offset int64
- if _, err := t.CopyIn(inOffset, &offset); err != nil {
+ if _, err := primitive.CopyInt64In(t, inOffset, &offset); err != nil {
return 0, nil, err
}
@@ -280,7 +281,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
}
// See above; inFile is chosen arbitrarily here.
- return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "splice", inFile)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "splice", inFile)
}
// Tee implements tee(2).
@@ -300,13 +301,13 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
if outFile == nil {
return 0, nil, syserror.EBADF
}
- defer outFile.DecRef()
+ defer outFile.DecRef(t)
inFile := t.GetFile(inFD)
if inFile == nil {
return 0, nil, syserror.EBADF
}
- defer inFile.DecRef()
+ defer inFile.DecRef(t)
// All files must be pipes.
if !fs.IsPipe(inFile.Dirent.Inode.StableAttr) || !fs.IsPipe(outFile.Dirent.Inode.StableAttr) {
@@ -333,5 +334,5 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
}
// See above; inFile is chosen arbitrarily here.
- return uintptr(n), nil, handleIOError(t, false, err, kernel.ERESTARTSYS, "tee", inFile)
+ return uintptr(n), nil, handleIOError(t, false, err, syserror.ERESTARTSYS, "tee", inFile)
}
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
index 46ebf27a2..cda29a8b5 100644
--- a/pkg/sentry/syscalls/linux/sys_stat.go
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -58,7 +58,7 @@ func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
return 0, nil, fstat(t, file, statAddr)
}
@@ -100,7 +100,7 @@ func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
return 0, nil, fstat(t, file, statAddr)
}
@@ -158,7 +158,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
uattr, err := file.UnstableAttr(t)
if err != nil {
return 0, nil, err
@@ -221,7 +221,7 @@ func statx(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr, statxAddr
DevMajor: uint32(devMajor),
DevMinor: devMinor,
}
- _, err := t.CopyOut(statxAddr, &s)
+ _, err := s.CopyOut(t, statxAddr)
return err
}
@@ -249,7 +249,7 @@ func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
return 0, nil, statfsImpl(t, file.Dirent, statfsAddr)
}
@@ -283,7 +283,7 @@ func statfsImpl(t *kernel.Task, d *fs.Dirent, addr usermem.Addr) error {
FragmentSize: d.Inode.StableAttr.BlockSize,
// Leave other fields 0 like simple_statfs does.
}
- _, err = t.CopyOut(addr, &statfs)
+ _, err = statfs.CopyOut(t, addr)
return err
}
diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go
index 5ad465ae3..048a21c6e 100644
--- a/pkg/sentry/syscalls/linux/sys_sync.go
+++ b/pkg/sentry/syscalls/linux/sys_sync.go
@@ -39,7 +39,7 @@ func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Use "sync-the-world" for now, it's guaranteed that fd is at least
// on the root filesystem.
@@ -54,10 +54,10 @@ func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncAll)
- return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+ return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS)
}
// Fdatasync implements linux syscall fdatasync(2).
@@ -70,10 +70,10 @@ func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncData)
- return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+ return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS)
}
// SyncFileRange implements linux syscall sync_file_range(2).
@@ -103,7 +103,7 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// SYNC_FILE_RANGE_WAIT_BEFORE waits upon write-out of all pages in the
// specified range that have already been submitted to the device
@@ -135,7 +135,7 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
err = file.Fsync(t, offset, fs.FileMaxOffset, fs.SyncData)
}
- return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+ return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS)
}
// LINT.ThenChange(vfs2/sync.go)
diff --git a/pkg/sentry/syscalls/linux/sys_sysinfo.go b/pkg/sentry/syscalls/linux/sys_sysinfo.go
index 297de052a..db3d924d9 100644
--- a/pkg/sentry/syscalls/linux/sys_sysinfo.go
+++ b/pkg/sentry/syscalls/linux/sys_sysinfo.go
@@ -21,13 +21,17 @@ import (
"gvisor.dev/gvisor/pkg/sentry/usage"
)
-// Sysinfo implements the sysinfo syscall as described in man 2 sysinfo.
+// Sysinfo implements Linux syscall sysinfo(2).
func Sysinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
addr := args[0].Pointer()
mf := t.Kernel().MemoryFile()
- mf.UpdateUsage()
- _, totalUsage := usage.MemoryAccounting.Copy()
+ mfUsage, err := mf.TotalUsage()
+ if err != nil {
+ return 0, nil, err
+ }
+ memStats, _ := usage.MemoryAccounting.Copy()
+ totalUsage := mfUsage + memStats.Mapped
totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage)
memFree := totalSize - totalUsage
if memFree > totalSize {
@@ -37,12 +41,12 @@ func Sysinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
// Only a subset of the fields in sysinfo_t make sense to return.
si := linux.Sysinfo{
- Procs: uint16(len(t.PIDNamespace().Tasks())),
+ Procs: uint16(t.Kernel().TaskSet().Root.NumTasks()),
Uptime: t.Kernel().MonotonicClock().Now().Seconds(),
TotalRAM: totalSize,
FreeRAM: memFree,
Unit: 1,
}
- _, err := t.CopyOut(addr, si)
+ _, err = si.CopyOut(t, addr)
return 0, nil, err
}
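
The new accounting in Sysinfo combines the memory file's usage with the mapped total from MemoryAccounting, then clamps free memory. The clamp matters because usage and total size are sampled at different moments; the arithmetic in isolation:

package main

import "fmt"

// freeRAM reproduces the computation above: uint64 subtraction wraps on
// underflow, so a wrapped (huge) result signals that usage raced past the
// sampled total, and free memory is reported as zero instead.
func freeRAM(totalSize, totalUsage uint64) uint64 {
	memFree := totalSize - totalUsage
	if memFree > totalSize {
		memFree = 0
	}
	return memFree
}

func main() {
	fmt.Println(freeRAM(1<<30, 1<<29))        // normal case
	fmt.Println(freeRAM(1<<30, (1<<30)+4096)) // usage raced above total: 0
}
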
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index 00915fdde..39ca9ea97 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -19,6 +19,7 @@ import (
"syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fsbridge"
@@ -117,7 +118,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
resolveFinal := flags&linux.AT_SYMLINK_NOFOLLOW == 0
root := t.FSContext().RootDirectory()
- defer root.DecRef()
+ defer root.DecRef(t)
var wd *fs.Dirent
var executable fsbridge.File
@@ -133,7 +134,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
if f == nil {
return 0, nil, syserror.EBADF
}
- defer f.DecRef()
+ defer f.DecRef(t)
closeOnExec = fdFlags.CloseOnExec
if atEmptyPath && len(pathname) == 0 {
@@ -155,7 +156,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
}
}
if wd != nil {
- defer wd.DecRef()
+ defer wd.DecRef(t)
}
// Load the new TaskContext.
@@ -262,7 +263,7 @@ func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error {
wopts.Events |= kernel.EventGroupContinue
}
if options&linux.WNOHANG == 0 {
- wopts.BlockInterruptErr = kernel.ERESTARTSYS
+ wopts.BlockInterruptErr = syserror.ERESTARTSYS
}
if options&linux.WNOTHREAD == 0 {
wopts.SiblingChildren = true
@@ -311,13 +312,13 @@ func wait4(t *kernel.Task, pid int, statusAddr usermem.Addr, options int, rusage
return 0, err
}
if statusAddr != 0 {
- if _, err := t.CopyOut(statusAddr, wr.Status); err != nil {
+ if _, err := primitive.CopyUint32Out(t, statusAddr, wr.Status); err != nil {
return 0, err
}
}
if rusageAddr != 0 {
ru := getrusage(wr.Task, linux.RUSAGE_BOTH)
- if _, err := t.CopyOut(rusageAddr, &ru); err != nil {
+ if _, err := ru.CopyOut(t, rusageAddr); err != nil {
return 0, err
}
}
@@ -395,14 +396,14 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// as well.
if infop != 0 {
var si arch.SignalInfo
- _, err = t.CopyOut(infop, &si)
+ _, err = si.CopyOut(t, infop)
}
}
return 0, nil, err
}
if rusageAddr != 0 {
ru := getrusage(wr.Task, linux.RUSAGE_BOTH)
- if _, err := t.CopyOut(rusageAddr, &ru); err != nil {
+ if _, err := ru.CopyOut(t, rusageAddr); err != nil {
return 0, nil, err
}
}
@@ -441,7 +442,7 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
default:
t.Warningf("waitid got incomprehensible wait status %d", s)
}
- _, err = t.CopyOut(infop, &si)
+ _, err = si.CopyOut(t, infop)
return 0, nil, err
}
@@ -558,9 +559,7 @@ func Getcpu(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// third argument to this system call is nowadays unused.
if cpu != 0 {
- buf := t.CopyScratchBuffer(4)
- usermem.ByteOrder.PutUint32(buf, uint32(t.CPU()))
- if _, err := t.CopyOutBytes(cpu, buf); err != nil {
+ if _, err := primitive.CopyInt32Out(t, cpu, t.CPU()); err != nil {
return 0, nil, err
}
}
diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go
index 2d2aa0819..c5054d2f1 100644
--- a/pkg/sentry/syscalls/linux/sys_time.go
+++ b/pkg/sentry/syscalls/linux/sys_time.go
@@ -19,6 +19,7 @@ import (
"time"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -168,7 +169,7 @@ func Time(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
return uintptr(r), nil, nil
}
- if _, err := t.CopyOut(addr, r); err != nil {
+ if _, err := r.CopyOut(t, addr); err != nil {
return 0, nil, err
}
return uintptr(r), nil, nil
@@ -213,7 +214,7 @@ func clockNanosleepUntil(t *kernel.Task, c ktime.Clock, ts linux.Timespec) error
return nil
}
- return syserror.ConvertIntr(err, kernel.ERESTARTNOHAND)
+ return syserror.ConvertIntr(err, syserror.ERESTARTNOHAND)
}
// clockNanosleepFor blocks for a specified duration.
@@ -254,7 +255,7 @@ func clockNanosleepFor(t *kernel.Task, c ktime.Clock, dur time.Duration, rem use
duration: remaining,
rem: rem,
})
- return kernel.ERESTART_RESTARTBLOCK
+ return syserror.ERESTART_RESTARTBLOCK
default:
panic(fmt.Sprintf("Impossible BlockWithTimer error %v", err))
}
@@ -334,8 +335,8 @@ func Gettimeofday(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
// Ask the time package for the timezone.
_, offset := time.Now().Zone()
// This int32 slice mimics linux's struct timezone.
- timezone := [2]int32{-int32(offset) / 60, 0}
- _, err := t.CopyOut(tz, timezone)
+ timezone := []int32{-int32(offset) / 60, 0}
+ _, err := primitive.CopyInt32SliceOut(t, tz, timezone)
return 0, nil, err
}
return 0, nil, nil
diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go
index a4c400f87..45eef4feb 100644
--- a/pkg/sentry/syscalls/linux/sys_timer.go
+++ b/pkg/sentry/syscalls/linux/sys_timer.go
@@ -21,81 +21,63 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
)
const nsecPerSec = int64(time.Second)
-// copyItimerValIn copies an ItimerVal from the untrusted app range to the
-// kernel. The ItimerVal may be either 32 or 64 bits.
-// A NULL address is allowed because because Linux allows
-// setitimer(which, NULL, &old_value) which disables the timer.
-// There is a KERN_WARN message saying this misfeature will be removed.
-// However, that hasn't happened as of 3.19, so we continue to support it.
-func copyItimerValIn(t *kernel.Task, addr usermem.Addr) (linux.ItimerVal, error) {
- if addr == usermem.Addr(0) {
- return linux.ItimerVal{}, nil
- }
-
- switch t.Arch().Width() {
- case 8:
- // Native size, just copy directly.
- var itv linux.ItimerVal
- if _, err := t.CopyIn(addr, &itv); err != nil {
- return linux.ItimerVal{}, err
- }
-
- return itv, nil
- default:
- return linux.ItimerVal{}, syserror.ENOSYS
- }
-}
-
-// copyItimerValOut copies an ItimerVal to the untrusted app range.
-// The ItimerVal may be either 32 or 64 bits.
-// A NULL address is allowed, in which case no copy takes place
-func copyItimerValOut(t *kernel.Task, addr usermem.Addr, itv *linux.ItimerVal) error {
- if addr == usermem.Addr(0) {
- return nil
- }
-
- switch t.Arch().Width() {
- case 8:
- // Native size, just copy directly.
- _, err := t.CopyOut(addr, itv)
- return err
- default:
- return syserror.ENOSYS
- }
-}
-
// Getitimer implements linux syscall getitimer(2).
func Getitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ if t.Arch().Width() != 8 {
+ // Definition of linux.ItimerVal assumes 64-bit architecture.
+ return 0, nil, syserror.ENOSYS
+ }
+
timerID := args[0].Int()
- val := args[1].Pointer()
+ addr := args[1].Pointer()
olditv, err := t.Getitimer(timerID)
if err != nil {
return 0, nil, err
}
- return 0, nil, copyItimerValOut(t, val, &olditv)
+ // A NULL address is allowed, in which case no copy out takes place.
+ if addr == 0 {
+ return 0, nil, nil
+ }
+ _, err = olditv.CopyOut(t, addr)
+ return 0, nil, err
}
// Setitimer implements linux syscall setitimer(2).
func Setitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- timerID := args[0].Int()
- newVal := args[1].Pointer()
- oldVal := args[2].Pointer()
+ if t.Arch().Width() != 8 {
+ // Definition of linux.ItimerVal assumes 64-bit architecture.
+ return 0, nil, syserror.ENOSYS
+ }
- newitv, err := copyItimerValIn(t, newVal)
- if err != nil {
- return 0, nil, err
+ timerID := args[0].Int()
+ newAddr := args[1].Pointer()
+ oldAddr := args[2].Pointer()
+
+ var newitv linux.ItimerVal
+	// A NULL address is allowed because Linux allows
+ // setitimer(which, NULL, &old_value) which disables the timer. There is a
+ // KERN_WARN message saying this misfeature will be removed. However, that
+ // hasn't happened as of 3.19, so we continue to support it.
+ if newAddr != 0 {
+ if _, err := newitv.CopyIn(t, newAddr); err != nil {
+ return 0, nil, err
+ }
}
olditv, err := t.Setitimer(timerID, newitv)
if err != nil {
return 0, nil, err
}
- return 0, nil, copyItimerValOut(t, oldVal, &olditv)
+ // A NULL address is allowed, in which case no copy out takes place.
+ if oldAddr == 0 {
+ return 0, nil, nil
+ }
+ _, err = olditv.CopyOut(t, oldAddr)
+ return 0, nil, err
}
// Alarm implements linux syscall alarm(2).
@@ -131,7 +113,7 @@ func TimerCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
var sev *linux.Sigevent
if sevp != 0 {
sev = &linux.Sigevent{}
- if _, err = t.CopyIn(sevp, sev); err != nil {
+ if _, err = sev.CopyIn(t, sevp); err != nil {
return 0, nil, err
}
}
@@ -141,7 +123,7 @@ func TimerCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
return 0, nil, err
}
- if _, err := t.CopyOut(timerIDp, &id); err != nil {
+ if _, err := id.CopyOut(t, timerIDp); err != nil {
t.IntervalTimerDelete(id)
return 0, nil, err
}
@@ -157,7 +139,7 @@ func TimerSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
oldValAddr := args[3].Pointer()
var newVal linux.Itimerspec
- if _, err := t.CopyIn(newValAddr, &newVal); err != nil {
+ if _, err := newVal.CopyIn(t, newValAddr); err != nil {
return 0, nil, err
}
oldVal, err := t.IntervalTimerSettime(timerID, newVal, flags&linux.TIMER_ABSTIME != 0)
@@ -165,9 +147,8 @@ func TimerSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
return 0, nil, err
}
if oldValAddr != 0 {
- if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil {
- return 0, nil, err
- }
+ _, err = oldVal.CopyOut(t, oldValAddr)
+ return 0, nil, err
}
return 0, nil, nil
}
@@ -181,7 +162,7 @@ func TimerGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
if err != nil {
return 0, nil, err
}
- _, err = t.CopyOut(curValAddr, &curVal)
+ _, err = curVal.CopyOut(t, curValAddr)
return 0, nil, err
}
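
The rewritten Getitimer/Setitimer keep two NULL-pointer conventions from Linux: a NULL new-value pointer leaves the timer untouched (the tolerated misfeature noted in the comment), and a NULL old-value pointer suppresses the copy-out. A stand-in model of that control flow, with addr 0 playing the role of a NULL user pointer:

package main

import "fmt"

type itimerVal struct{ interval, value int64 }

// setitimer models the branch structure above; it returns the previous
// value and whether a copy-out would have happened.
func setitimer(cur *itimerVal, newAddr, oldAddr int, newV itimerVal) (itimerVal, bool) {
	old := *cur
	if newAddr != 0 { // NULL new value: leave the timer as-is
		*cur = newV
	}
	if oldAddr == 0 { // NULL old value: skip the copy-out
		return old, false
	}
	return old, true
}

func main() {
	cur := itimerVal{value: 100}
	old, copied := setitimer(&cur, 0, 0x1000, itimerVal{value: 5})
	fmt.Println(cur, old, copied) // timer unchanged, old value copied out
}
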
diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go
index cf49b43db..cadd9d348 100644
--- a/pkg/sentry/syscalls/linux/sys_timerfd.go
+++ b/pkg/sentry/syscalls/linux/sys_timerfd.go
@@ -43,7 +43,7 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
return 0, nil, syserror.EINVAL
}
f := timerfd.NewFile(t, c)
- defer f.DecRef()
+ defer f.DecRef(t)
f.SetFlags(fs.SettableFileFlags{
NonBlocking: flags&linux.TFD_NONBLOCK != 0,
})
@@ -73,7 +73,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
if f == nil {
return 0, nil, syserror.EBADF
}
- defer f.DecRef()
+ defer f.DecRef(t)
tf, ok := f.FileOperations.(*timerfd.TimerOperations)
if !ok {
@@ -81,7 +81,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
}
var newVal linux.Itimerspec
- if _, err := t.CopyIn(newValAddr, &newVal); err != nil {
+ if _, err := newVal.CopyIn(t, newValAddr); err != nil {
return 0, nil, err
}
newS, err := ktime.SettingFromItimerspec(newVal, flags&linux.TFD_TIMER_ABSTIME != 0, tf.Clock())
@@ -91,7 +91,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
tm, oldS := tf.SetTime(newS)
if oldValAddr != 0 {
oldVal := ktime.ItimerspecFromSetting(tm, oldS)
- if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil {
+ if _, err := oldVal.CopyOut(t, oldValAddr); err != nil {
return 0, nil, err
}
}
@@ -107,7 +107,7 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
if f == nil {
return 0, nil, syserror.EBADF
}
- defer f.DecRef()
+ defer f.DecRef(t)
tf, ok := f.FileOperations.(*timerfd.TimerOperations)
if !ok {
@@ -116,6 +116,6 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
tm, s := tf.GetTime()
curVal := ktime.ItimerspecFromSetting(tm, s)
- _, err := t.CopyOut(curValAddr, &curVal)
+ _, err := curVal.CopyOut(t, curValAddr)
return 0, nil, err
}
diff --git a/pkg/sentry/syscalls/linux/sys_tls_amd64.go b/pkg/sentry/syscalls/linux/sys_tls_amd64.go
index b3eb96a1c..6ddd30d5c 100644
--- a/pkg/sentry/syscalls/linux/sys_tls_amd64.go
+++ b/pkg/sentry/syscalls/linux/sys_tls_amd64.go
@@ -18,6 +18,7 @@ package linux
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/syserror"
@@ -30,17 +31,19 @@ func ArchPrctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
case linux.ARCH_GET_FS:
addr := args[1].Pointer()
fsbase := t.Arch().TLS()
- _, err := t.CopyOut(addr, uint64(fsbase))
- if err != nil {
- return 0, nil, err
+ switch t.Arch().Width() {
+ case 8:
+ if _, err := primitive.CopyUint64Out(t, addr, uint64(fsbase)); err != nil {
+ return 0, nil, err
+ }
+ default:
+ return 0, nil, syserror.ENOSYS
}
-
case linux.ARCH_SET_FS:
fsbase := args[1].Uint64()
if !t.Arch().SetTLS(uintptr(fsbase)) {
return 0, nil, syserror.EPERM
}
-
case linux.ARCH_GET_GS, linux.ARCH_SET_GS:
t.Kernel().EmitUnimplementedEvent(t)
fallthrough
diff --git a/pkg/sentry/syscalls/linux/sys_utsname.go b/pkg/sentry/syscalls/linux/sys_utsname.go
index e9d702e8e..66c5974f5 100644
--- a/pkg/sentry/syscalls/linux/sys_utsname.go
+++ b/pkg/sentry/syscalls/linux/sys_utsname.go
@@ -46,7 +46,7 @@ func Uname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Copy out the result.
va := args[0].Pointer()
- _, err := t.CopyOut(va, u)
+ _, err := u.CopyOut(t, va)
return 0, nil, err
}
diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go
index 6ec0de96e..95bfe6606 100644
--- a/pkg/sentry/syscalls/linux/sys_write.go
+++ b/pkg/sentry/syscalls/linux/sys_write.go
@@ -48,7 +48,7 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the file is writable.
if !file.Flags().Write {
@@ -71,7 +71,7 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
n, err := writev(t, file, src)
t.IOUsage().AccountWriteSyscall(n)
- return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "write", file)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "write", file)
}
// Pwrite64 implements linux syscall pwrite64(2).
@@ -85,7 +85,7 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the offset is legitimate and does not overflow.
if offset < 0 || offset+int64(size) < 0 {
@@ -118,7 +118,7 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
n, err := pwritev(t, file, src, offset)
t.IOUsage().AccountWriteSyscall(n)
- return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwrite64", file)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pwrite64", file)
}
// Writev implements linux syscall writev(2).
@@ -131,7 +131,7 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the file is writable.
if !file.Flags().Write {
@@ -148,7 +148,7 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
n, err := writev(t, file, src)
t.IOUsage().AccountWriteSyscall(n)
- return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "writev", file)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "writev", file)
}
// Pwritev implements linux syscall pwritev(2).
@@ -162,7 +162,7 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the offset is legitimate.
if offset < 0 {
@@ -189,7 +189,7 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
n, err := pwritev(t, file, src, offset)
t.IOUsage().AccountWriteSyscall(n)
- return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwritev", file)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pwritev", file)
}
// Pwritev2 implements linux syscall pwritev2(2).
@@ -215,7 +215,7 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the offset is legitimate.
if offset < -1 {
@@ -250,12 +250,12 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if offset == -1 {
n, err := writev(t, file, src)
t.IOUsage().AccountWriteSyscall(n)
- return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pwritev2", file)
}
n, err := pwritev(t, file, src, offset)
t.IOUsage().AccountWriteSyscall(n)
- return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file)
+ return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pwritev2", file)
}
func writev(t *kernel.Task, f *fs.File, src usermem.IOSequence) (int64, error) {
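
Every call site in this file now takes its restart sentinel from syserror rather than kernel; handleIOError's behavior is unchanged. As a reminder of what that helper guarantees, here is a simplified standalone sketch (handleIOError, errInterrupted, and errRestartSys are illustrative stand-ins, not the real implementation): a partial transfer suppresses the error so the byte count is reported, and an interruption maps to the restartable sentinel so the syscall is retried after signal delivery.

package main

import (
	"errors"
	"fmt"
)

var (
	errInterrupted = errors.New("interrupted") // stand-in for the sentry's interrupt error
	errRestartSys  = errors.New("ERESTARTSYS") // restartable, like syserror.ERESTARTSYS
)

// handleIOError sketches the two properties the hunks above rely on.
func handleIOError(partial bool, err, intr error) error {
	switch {
	case err == nil:
		return nil
	case partial:
		// Progress was made; report success and let the application
		// retry for the remainder.
		return nil
	case errors.Is(err, errInterrupted):
		// Rewrite interruptions to the restartable sentinel.
		return intr
	default:
		return err
	}
}

func main() {
	fmt.Println(handleIOError(true, errInterrupted, errRestartSys))  // <nil>
	fmt.Println(handleIOError(false, errInterrupted, errRestartSys)) // ERESTARTSYS
}
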
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
index c24946160..97474fd3c 100644
--- a/pkg/sentry/syscalls/linux/sys_xattr.go
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -49,7 +49,7 @@ func FGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if f == nil {
return 0, nil, syserror.EBADF
}
- defer f.DecRef()
+ defer f.DecRef(t)
n, err := getXattr(t, f.Dirent, nameAddr, valueAddr, size)
if err != nil {
@@ -153,7 +153,7 @@ func FSetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if f == nil {
return 0, nil, syserror.EBADF
}
- defer f.DecRef()
+ defer f.DecRef(t)
return 0, nil, setXattr(t, f.Dirent, nameAddr, valueAddr, uint64(size), flags)
}
@@ -270,7 +270,7 @@ func FListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
if f == nil {
return 0, nil, syserror.EBADF
}
- defer f.DecRef()
+ defer f.DecRef(t)
n, err := listXattr(t, f.Dirent, listAddr, size)
if err != nil {
@@ -384,7 +384,7 @@ func FRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
if f == nil {
return 0, nil, syserror.EBADF
}
- defer f.DecRef()
+ defer f.DecRef(t)
return 0, nil, removeXattr(t, f.Dirent, nameAddr)
}
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
index 9c8b44f64..9ee766552 100644
--- a/pkg/sentry/syscalls/linux/vfs2/BUILD
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -5,6 +5,7 @@ package(licenses = ["notice"])
go_library(
name = "vfs2",
srcs = [
+ "aio.go",
"epoll.go",
"eventfd.go",
"execve.go",
@@ -14,8 +15,10 @@ go_library(
"getdents.go",
"inotify.go",
"ioctl.go",
+ "lock.go",
"memfd.go",
"mmap.go",
+ "mount.go",
"path.go",
"pipe.go",
"poll.go",
@@ -38,9 +41,14 @@ go_library(
"//pkg/abi/linux",
"//pkg/binary",
"//pkg/bits",
+ "//pkg/context",
"//pkg/fspath",
"//pkg/gohacks",
+ "//pkg/log",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
"//pkg/sentry/arch",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/fsbridge",
"//pkg/sentry/fsimpl/eventfd",
"//pkg/sentry/fsimpl/pipefs",
@@ -49,11 +57,13 @@ go_library(
"//pkg/sentry/fsimpl/tmpfs",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/fasync",
"//pkg/sentry/kernel/pipe",
"//pkg/sentry/kernel/time",
"//pkg/sentry/limits",
"//pkg/sentry/loader",
"//pkg/sentry/memmap",
+ "//pkg/sentry/mm",
"//pkg/sentry/socket",
"//pkg/sentry/socket/control",
"//pkg/sentry/socket/unix/transport",
diff --git a/pkg/sentry/syscalls/linux/vfs2/aio.go b/pkg/sentry/syscalls/linux/vfs2/aio.go
new file mode 100644
index 000000000..6d0a38330
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/aio.go
@@ -0,0 +1,226 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/mm"
+ slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// IoSubmit implements linux syscall io_submit(2).
+func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ id := args[0].Uint64()
+ nrEvents := args[1].Int()
+ addr := args[2].Pointer()
+
+ if nrEvents < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ for i := int32(0); i < nrEvents; i++ {
+ // Copy in the callback address.
+ var cbAddr usermem.Addr
+ switch t.Arch().Width() {
+ case 8:
+ var cbAddrP primitive.Uint64
+ if _, err := cbAddrP.CopyIn(t, addr); err != nil {
+ if i > 0 {
+				// Some callbacks were submitted successfully; report partial success.
+ return uintptr(i), nil, nil
+ }
+ // Nothing done.
+ return 0, nil, err
+ }
+ cbAddr = usermem.Addr(cbAddrP)
+ default:
+ return 0, nil, syserror.ENOSYS
+ }
+
+ // Copy in this callback.
+ var cb linux.IOCallback
+ if _, err := cb.CopyIn(t, cbAddr); err != nil {
+ if i > 0 {
+				// Some callbacks were submitted successfully; report partial success.
+ return uintptr(i), nil, nil
+ }
+ // Nothing done.
+ return 0, nil, err
+ }
+
+ // Process this callback.
+ if err := submitCallback(t, id, &cb, cbAddr); err != nil {
+ if i > 0 {
+ // Partial success.
+ return uintptr(i), nil, nil
+ }
+ // Nothing done.
+ return 0, nil, err
+ }
+
+ // Advance to the next one.
+ addr += usermem.Addr(t.Arch().Width())
+ }
+
+ return uintptr(nrEvents), nil, nil
+}
+
+// submitCallback processes a single callback.
+func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr usermem.Addr) error {
+ if cb.Reserved2 != 0 {
+ return syserror.EINVAL
+ }
+
+ fd := t.GetFileVFS2(cb.FD)
+ if fd == nil {
+ return syserror.EBADF
+ }
+ defer fd.DecRef(t)
+
+ // Was there an eventFD? Extract it.
+ var eventFD *vfs.FileDescription
+ if cb.Flags&linux.IOCB_FLAG_RESFD != 0 {
+ eventFD = t.GetFileVFS2(cb.ResFD)
+ if eventFD == nil {
+ return syserror.EBADF
+ }
+ defer eventFD.DecRef(t)
+
+ // Check that it is an eventfd.
+ if _, ok := eventFD.Impl().(*eventfd.EventFileDescription); !ok {
+ return syserror.EINVAL
+ }
+ }
+
+ ioseq, err := memoryFor(t, cb)
+ if err != nil {
+ return err
+ }
+
+ // Check offset for reads/writes.
+ switch cb.OpCode {
+ case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV:
+ if cb.Offset < 0 {
+ return syserror.EINVAL
+ }
+ }
+
+ // Prepare the request.
+ aioCtx, ok := t.MemoryManager().LookupAIOContext(t, id)
+ if !ok {
+ return syserror.EINVAL
+ }
+ if ready := aioCtx.Prepare(); !ready {
+ // Context is busy.
+ return syserror.EAGAIN
+ }
+
+ if eventFD != nil {
+		// The request is set. Make sure a reference is held on the
+		// eventfd file: the completion callback signals the eventfd,
+		// and it is also what releases this reference.
+ eventFD.IncRef()
+ }
+
+ // Perform the request asynchronously.
+ fd.IncRef()
+ t.QueueAIO(getAIOCallback(t, fd, eventFD, cbAddr, cb, ioseq, aioCtx))
+ return nil
+}
+
+func getAIOCallback(t *kernel.Task, fd, eventFD *vfs.FileDescription, cbAddr usermem.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, aioCtx *mm.AIOContext) kernel.AIOCallback {
+ return func(ctx context.Context) {
+ // Release references after completing the callback.
+ defer fd.DecRef(ctx)
+ if eventFD != nil {
+ defer eventFD.DecRef(ctx)
+ }
+
+ if aioCtx.Dead() {
+ aioCtx.CancelPendingRequest()
+ return
+ }
+ ev := &linux.IOEvent{
+ Data: cb.Data,
+ Obj: uint64(cbAddr),
+ }
+
+ var err error
+ switch cb.OpCode {
+ case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV:
+ ev.Result, err = fd.PRead(ctx, ioseq, cb.Offset, vfs.ReadOptions{})
+ case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV:
+ ev.Result, err = fd.PWrite(ctx, ioseq, cb.Offset, vfs.WriteOptions{})
+ case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC:
+ err = fd.Sync(ctx)
+ }
+
+ // Update the result.
+ if err != nil {
+ err = slinux.HandleIOErrorVFS2(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", fd)
+ ev.Result = -int64(kernel.ExtractErrno(err, 0))
+ }
+
+ // Queue the result for delivery.
+ aioCtx.FinishRequest(ev)
+
+ // Notify the event file if one was specified. This needs to happen
+ // *after* queueing the result to avoid racing with the thread we may
+ // wake up.
+ if eventFD != nil {
+ eventFD.Impl().(*eventfd.EventFileDescription).Signal(1)
+ }
+ }
+}
+
+// memoryFor returns appropriate memory for the given callback.
+func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) {
+ bytes := int(cb.Bytes)
+ if bytes < 0 {
+ // Linux also requires that this field fit in ssize_t.
+ return usermem.IOSequence{}, syserror.EINVAL
+ }
+
+ // Since this I/O will be asynchronous with respect to t's task goroutine,
+ // we have no guarantee that t's AddressSpace will be active during the
+ // I/O.
+ switch cb.OpCode {
+ case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE:
+ return t.SingleIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{
+ AddressSpaceActive: false,
+ })
+
+ case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV:
+ return t.IovecsIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{
+ AddressSpaceActive: false,
+ })
+
+ case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP:
+ return usermem.IOSequence{}, nil
+
+ default:
+ // Not a supported command.
+ return usermem.IOSequence{}, syserror.EINVAL
+ }
+}
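
The comment in getAIOCallback stresses that the eventfd must be signaled only after FinishRequest has queued the result. The toy program below (completionQueue and the channel-based "eventfd" are inventions of the sketch) shows why the ordering matters: a waiter woken by the signal is guaranteed to find a queued result.

package main

import (
	"fmt"
	"sync"
)

// completionQueue is a toy stand-in for mm.AIOContext's result queue.
type completionQueue struct {
	mu      sync.Mutex
	results []int64
}

func (q *completionQueue) finish(r int64) {
	q.mu.Lock()
	defer q.mu.Unlock()
	q.results = append(q.results, r)
}

func (q *completionQueue) pop() (int64, bool) {
	q.mu.Lock()
	defer q.mu.Unlock()
	if len(q.results) == 0 {
		return 0, false
	}
	r := q.results[0]
	q.results = q.results[1:]
	return r, true
}

func main() {
	q := &completionQueue{}
	signal := make(chan struct{}, 1) // models the eventfd wakeup

	go func() {
		q.finish(42)         // queue the result first...
		signal <- struct{}{} // ...then wake the waiter.
	}()

	<-signal
	r, ok := q.pop() // non-empty precisely because of the ordering above
	fmt.Println(r, ok)
}
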
diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go
index 34c90ae3e..d0cbb77eb 100644
--- a/pkg/sentry/syscalls/linux/vfs2/epoll.go
+++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go
@@ -24,7 +24,6 @@ import (
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -37,11 +36,11 @@ func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
return 0, nil, syserror.EINVAL
}
- file, err := t.Kernel().VFS().NewEpollInstanceFD()
+ file, err := t.Kernel().VFS().NewEpollInstanceFD(t)
if err != nil {
return 0, nil, err
}
- defer file.DecRef()
+ defer file.DecRef(t)
fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
CloseOnExec: flags&linux.EPOLL_CLOEXEC != 0,
@@ -62,11 +61,11 @@ func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
return 0, nil, syserror.EINVAL
}
- file, err := t.Kernel().VFS().NewEpollInstanceFD()
+ file, err := t.Kernel().VFS().NewEpollInstanceFD(t)
if err != nil {
return 0, nil, err
}
- defer file.DecRef()
+ defer file.DecRef(t)
fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{})
if err != nil {
@@ -86,7 +85,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if epfile == nil {
return 0, nil, syserror.EBADF
}
- defer epfile.DecRef()
+ defer epfile.DecRef(t)
ep, ok := epfile.Impl().(*vfs.EpollInstance)
if !ok {
return 0, nil, syserror.EINVAL
@@ -95,7 +94,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
if epfile == file {
return 0, nil, syserror.EINVAL
}
@@ -135,56 +134,32 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if epfile == nil {
return 0, nil, syserror.EBADF
}
- defer epfile.DecRef()
+ defer epfile.DecRef(t)
ep, ok := epfile.Impl().(*vfs.EpollInstance)
if !ok {
return 0, nil, syserror.EINVAL
}
- // Use a fixed-size buffer in a loop, instead of make([]linux.EpollEvent,
- // maxEvents), so that the buffer can be allocated on the stack.
+ // Allocate space for a few events on the stack for the common case in
+ // which we don't have too many events.
var (
- events [16]linux.EpollEvent
- total int
+ eventsArr [16]linux.EpollEvent
ch chan struct{}
haveDeadline bool
deadline ktime.Time
)
for {
- batchEvents := len(events)
- if batchEvents > maxEvents {
- batchEvents = maxEvents
- }
- n := ep.ReadEvents(events[:batchEvents])
- maxEvents -= n
- if n != 0 {
- // Copy what we read out.
- copiedBytes, err := linux.CopyEpollEventSliceOut(t, eventsAddr, events[:n])
+ events := ep.ReadEvents(eventsArr[:0], maxEvents)
+ if len(events) != 0 {
+ copiedBytes, err := linux.CopyEpollEventSliceOut(t, eventsAddr, events)
copiedEvents := copiedBytes / sizeofEpollEvent // rounded down
- eventsAddr += usermem.Addr(copiedEvents * sizeofEpollEvent)
- total += copiedEvents
- if err != nil {
- if total != 0 {
- return uintptr(total), nil, nil
- }
- return 0, nil, err
- }
- // If we've filled the application's event buffer, we're done.
- if maxEvents == 0 {
- return uintptr(total), nil, nil
- }
- // Loop if we read a full batch, under the expectation that there
- // may be more events to read.
- if n == batchEvents {
- continue
+ if copiedEvents != 0 {
+ return uintptr(copiedEvents), nil, nil
}
+ return 0, nil, err
}
- // We get here if n != batchEvents. If we read any number of events
- // (just now, or in a previous iteration of this loop), or if timeout
- // is 0 (such that epoll_wait should be non-blocking), return the
- // events we've read so far to the application.
- if total != 0 || timeout == 0 {
- return uintptr(total), nil, nil
+ if timeout == 0 {
+ return 0, nil, nil
}
// In the first iteration of this loop, register with the epoll
// instance for readability events, but then immediately continue the
@@ -207,8 +182,6 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if err == syserror.ETIMEDOUT {
err = nil
}
- // total must be 0 since otherwise we would have returned
- // above.
return 0, nil, err
}
}
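
EpollWait now relies on an append-style ReadEvents that takes a destination slice and a cap, instead of draining a fixed batch buffer in a loop. A minimal sketch of that calling convention, with readEvents standing in for the real EpollInstance method:

package main

import "fmt"

type epollEvent struct {
	Events uint32
	Data   [2]int32
}

// readEvents appends up to maxEvents ready events to the slice it is given
// and returns the extended slice. Passing eventsArr[:0] lets the common case
// reuse stack storage with no allocation, while append still grows the slice
// when more events are ready than the array holds.
func readEvents(ready, events []epollEvent, maxEvents int) []epollEvent {
	for _, ev := range ready {
		if maxEvents == 0 {
			break
		}
		events = append(events, ev)
		maxEvents--
	}
	return events
}

func main() {
	ready := []epollEvent{{Events: 1}, {Events: 4}}
	var eventsArr [16]epollEvent
	events := readEvents(ready, eventsArr[:0], 8)
	fmt.Println(len(events)) // 2
}
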
diff --git a/pkg/sentry/syscalls/linux/vfs2/eventfd.go b/pkg/sentry/syscalls/linux/vfs2/eventfd.go
index aff1a2070..807f909da 100644
--- a/pkg/sentry/syscalls/linux/vfs2/eventfd.go
+++ b/pkg/sentry/syscalls/linux/vfs2/eventfd.go
@@ -38,11 +38,11 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
fileFlags |= linux.O_NONBLOCK
}
semMode := flags&linux.EFD_SEMAPHORE != 0
- eventfd, err := eventfd.New(vfsObj, initVal, semMode, fileFlags)
+ eventfd, err := eventfd.New(t, vfsObj, initVal, semMode, fileFlags)
if err != nil {
return 0, nil, err
}
- defer eventfd.DecRef()
+ defer eventfd.DecRef(t)
fd, err := t.NewFDFromVFS2(0, eventfd, kernel.FDFlags{
CloseOnExec: flags&linux.EFD_CLOEXEC != 0,
diff --git a/pkg/sentry/syscalls/linux/vfs2/execve.go b/pkg/sentry/syscalls/linux/vfs2/execve.go
index aef0078a8..c8ce2aabc 100644
--- a/pkg/sentry/syscalls/linux/vfs2/execve.go
+++ b/pkg/sentry/syscalls/linux/vfs2/execve.go
@@ -71,7 +71,7 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr user
}
root := t.FSContext().RootDirectoryVFS2()
- defer root.DecRef()
+ defer root.DecRef(t)
var executable fsbridge.File
closeOnExec := false
if path := fspath.Parse(pathname); dirfd != linux.AT_FDCWD && !path.Absolute {
@@ -90,7 +90,7 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr user
}
start := dirfile.VirtualDentry()
start.IncRef()
- dirfile.DecRef()
+ dirfile.DecRef(t)
closeOnExec = dirfileFlags.CloseOnExec
file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &vfs.PathOperation{
Root: root,
@@ -101,19 +101,18 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr user
Flags: linux.O_RDONLY,
FileExec: true,
})
- start.DecRef()
+ start.DecRef(t)
if err != nil {
return 0, nil, err
}
- defer file.DecRef()
+ defer file.DecRef(t)
executable = fsbridge.NewVFSFile(file)
}
// Load the new TaskContext.
- mntns := t.MountNamespaceVFS2() // FIXME(jamieliu): useless refcount change
- defer mntns.DecRef()
+ mntns := t.MountNamespaceVFS2()
wd := t.FSContext().WorkingDirectoryVFS2()
- defer wd.DecRef()
+ defer wd.DecRef(t)
remainingTraversals := uint(linux.MaxSymlinkTraversals)
loadArgs := loader.LoadArgs{
Opener: fsbridge.NewVFSLookup(mntns, root, wd),
diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go
index ca0f7fd1e..36e89700e 100644
--- a/pkg/sentry/syscalls/linux/vfs2/fd.go
+++ b/pkg/sentry/syscalls/linux/vfs2/fd.go
@@ -17,10 +17,13 @@ package vfs2
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/fasync"
"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -31,11 +34,11 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Note that Remove provides a reference on the file that we may use to
// flush. It is still active until we drop the final reference below
// (and other reference-holding operations complete).
- _, file := t.FDTable().Remove(fd)
+ _, file := t.FDTable().Remove(t, fd)
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
err := file.OnClose(t)
return 0, nil, slinux.HandleIOErrorVFS2(t, false /* partial */, err, syserror.EINTR, "close", file)
@@ -49,7 +52,7 @@ func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
newFD, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{})
if err != nil {
@@ -69,7 +72,7 @@ func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if file == nil {
return 0, nil, syserror.EBADF
}
- file.DecRef()
+ file.DecRef(t)
return uintptr(newfd), nil, nil
}
@@ -98,7 +101,7 @@ func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.Sy
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
err := t.NewFDAtVFS2(newfd, file, kernel.FDFlags{
CloseOnExec: flags&linux.O_CLOEXEC != 0,
@@ -118,7 +121,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
switch cmd {
case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC:
@@ -134,14 +137,49 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return uintptr(flags.ToLinuxFDFlags()), nil, nil
case linux.F_SETFD:
flags := args[2].Uint()
- t.FDTable().SetFlags(fd, kernel.FDFlags{
+ err := t.FDTable().SetFlagsVFS2(t, fd, kernel.FDFlags{
CloseOnExec: flags&linux.FD_CLOEXEC != 0,
})
- return 0, nil, nil
+ return 0, nil, err
case linux.F_GETFL:
return uintptr(file.StatusFlags()), nil, nil
case linux.F_SETFL:
return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint())
+ case linux.F_GETOWN:
+ owner, hasOwner := getAsyncOwner(t, file)
+ if !hasOwner {
+ return 0, nil, nil
+ }
+ if owner.Type == linux.F_OWNER_PGRP {
+ return uintptr(-owner.PID), nil, nil
+ }
+ return uintptr(owner.PID), nil, nil
+ case linux.F_SETOWN:
+ who := args[2].Int()
+ ownerType := int32(linux.F_OWNER_PID)
+ if who < 0 {
+ // Check for overflow before flipping the sign.
+ if who-1 > who {
+ return 0, nil, syserror.EINVAL
+ }
+ ownerType = linux.F_OWNER_PGRP
+ who = -who
+ }
+ return 0, nil, setAsyncOwner(t, file, ownerType, who)
+ case linux.F_GETOWN_EX:
+ owner, hasOwner := getAsyncOwner(t, file)
+ if !hasOwner {
+ return 0, nil, nil
+ }
+ _, err := owner.CopyOut(t, args[2].Pointer())
+ return 0, nil, err
+ case linux.F_SETOWN_EX:
+ var owner linux.FOwnerEx
+ _, err := owner.CopyIn(t, args[2].Pointer())
+ if err != nil {
+ return 0, nil, err
+ }
+ return 0, nil, setAsyncOwner(t, file, owner.Type, owner.PID)
case linux.F_SETPIPE_SZ:
pipefile, ok := file.Impl().(*pipe.VFSPipeFD)
if !ok {
@@ -167,8 +205,151 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
err := tmpfs.AddSeals(file, args[2].Uint())
return 0, nil, err
+ case linux.F_SETLK, linux.F_SETLKW:
+ return 0, nil, posixLock(t, args, file, cmd)
default:
- // TODO(gvisor.dev/issue/1623): Everything else is not yet supported.
+ // Everything else is not yet supported.
return 0, nil, syserror.EINVAL
}
}
+
+func getAsyncOwner(t *kernel.Task, fd *vfs.FileDescription) (ownerEx linux.FOwnerEx, hasOwner bool) {
+ a := fd.AsyncHandler()
+ if a == nil {
+ return linux.FOwnerEx{}, false
+ }
+
+ ot, otg, opg := a.(*fasync.FileAsync).Owner()
+ switch {
+ case ot != nil:
+ return linux.FOwnerEx{
+ Type: linux.F_OWNER_TID,
+ PID: int32(t.PIDNamespace().IDOfTask(ot)),
+ }, true
+ case otg != nil:
+ return linux.FOwnerEx{
+ Type: linux.F_OWNER_PID,
+ PID: int32(t.PIDNamespace().IDOfThreadGroup(otg)),
+ }, true
+ case opg != nil:
+ return linux.FOwnerEx{
+ Type: linux.F_OWNER_PGRP,
+ PID: int32(t.PIDNamespace().IDOfProcessGroup(opg)),
+ }, true
+ default:
+ return linux.FOwnerEx{}, true
+ }
+}
+
+func setAsyncOwner(t *kernel.Task, fd *vfs.FileDescription, ownerType, pid int32) error {
+ switch ownerType {
+ case linux.F_OWNER_TID, linux.F_OWNER_PID, linux.F_OWNER_PGRP:
+ // Acceptable type.
+ default:
+ return syserror.EINVAL
+ }
+
+ a := fd.SetAsyncHandler(fasync.NewVFS2).(*fasync.FileAsync)
+ if pid == 0 {
+ a.ClearOwner()
+ return nil
+ }
+
+ switch ownerType {
+ case linux.F_OWNER_TID:
+ task := t.PIDNamespace().TaskWithID(kernel.ThreadID(pid))
+ if task == nil {
+ return syserror.ESRCH
+ }
+ a.SetOwnerTask(t, task)
+ return nil
+ case linux.F_OWNER_PID:
+ tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(pid))
+ if tg == nil {
+ return syserror.ESRCH
+ }
+ a.SetOwnerThreadGroup(t, tg)
+ return nil
+ case linux.F_OWNER_PGRP:
+ pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(pid))
+ if pg == nil {
+ return syserror.ESRCH
+ }
+ a.SetOwnerProcessGroup(t, pg)
+ return nil
+ default:
+ return syserror.EINVAL
+ }
+}
+
+func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, cmd int32) error {
+ // Copy in the lock request.
+ flockAddr := args[2].Pointer()
+ var flock linux.Flock
+ if _, err := flock.CopyIn(t, flockAddr); err != nil {
+ return err
+ }
+
+ var blocker lock.Blocker
+ if cmd == linux.F_SETLKW {
+ blocker = t
+ }
+
+ switch flock.Type {
+ case linux.F_RDLCK:
+ if !file.IsReadable() {
+ return syserror.EBADF
+ }
+ return file.LockPOSIX(t, t.FDTable(), lock.ReadLock, uint64(flock.Start), uint64(flock.Len), flock.Whence, blocker)
+
+ case linux.F_WRLCK:
+ if !file.IsWritable() {
+ return syserror.EBADF
+ }
+ return file.LockPOSIX(t, t.FDTable(), lock.WriteLock, uint64(flock.Start), uint64(flock.Len), flock.Whence, blocker)
+
+ case linux.F_UNLCK:
+ return file.UnlockPOSIX(t, t.FDTable(), uint64(flock.Start), uint64(flock.Len), flock.Whence)
+
+ default:
+ return syserror.EINVAL
+ }
+}
+
+// Fadvise64 implements fadvise64(2).
+// This implementation currently ignores the provided advice.
+func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ length := args[2].Int64()
+ advice := args[3].Int()
+
+ // Note: offset is allowed to be negative.
+ if length < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef(t)
+
+ // If the FD refers to a pipe or FIFO, return error.
+ if _, isPipe := file.Impl().(*pipe.VFSPipeFD); isPipe {
+ return 0, nil, syserror.ESPIPE
+ }
+
+ switch advice {
+ case linux.POSIX_FADV_NORMAL:
+ case linux.POSIX_FADV_RANDOM:
+ case linux.POSIX_FADV_SEQUENTIAL:
+ case linux.POSIX_FADV_WILLNEED:
+ case linux.POSIX_FADV_DONTNEED:
+ case linux.POSIX_FADV_NOREUSE:
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+
+	// All recognized advice values are accepted and deliberately ignored.
+ return 0, nil, nil
+}
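
The F_SETOWN branch rejects the most negative "who" before negating it, since -who has no positive int32 counterpart there; the test who-1 > who can only hold when the subtraction wraps around. A small worked example (pgrpFromWho is a name invented for the sketch):

package main

import "fmt"

// pgrpFromWho mirrors the F_SETOWN/FIOSETOWN logic above: a negative "who"
// selects a process group, but math.MinInt32 must be rejected because its
// negation overflows. who-1 > who is true only when who-1 wraps to MaxInt32.
func pgrpFromWho(who int32) (ownerIsPgrp bool, pid int32, err error) {
	if who < 0 {
		if who-1 > who { // true only for -2147483648
			return false, 0, fmt.Errorf("EINVAL")
		}
		return true, -who, nil
	}
	return false, who, nil
}

func main() {
	fmt.Println(pgrpFromWho(-42))         // true 42 <nil>
	fmt.Println(pgrpFromWho(-2147483648)) // false 0 EINVAL
}
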
diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
index 46d3e189c..01e0f9010 100644
--- a/pkg/sentry/syscalls/linux/vfs2/filesystem.go
+++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
@@ -56,7 +56,7 @@ func linkat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd i
if err != nil {
return err
}
- defer oldtpop.Release()
+ defer oldtpop.Release(t)
newpath, err := copyInPath(t, newpathAddr)
if err != nil {
@@ -66,7 +66,7 @@ func linkat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd i
if err != nil {
return err
}
- defer newtpop.Release()
+ defer newtpop.Release(t)
return t.Kernel().VFS().LinkAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop)
}
@@ -95,7 +95,7 @@ func mkdirat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint) error {
if err != nil {
return err
}
- defer tpop.Release()
+ defer tpop.Release(t)
return t.Kernel().VFS().MkdirAt(t, t.Credentials(), &tpop.pop, &vfs.MkdirOptions{
Mode: linux.FileMode(mode & (0777 | linux.S_ISVTX) &^ t.FSContext().Umask()),
})
@@ -106,7 +106,7 @@ func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
addr := args[0].Pointer()
mode := args[1].ModeT()
dev := args[2].Uint()
- return 0, nil, mknodat(t, linux.AT_FDCWD, addr, mode, dev)
+ return 0, nil, mknodat(t, linux.AT_FDCWD, addr, linux.FileMode(mode), dev)
}
// Mknodat implements Linux syscall mknodat(2).
@@ -115,10 +115,10 @@ func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
addr := args[1].Pointer()
mode := args[2].ModeT()
dev := args[3].Uint()
- return 0, nil, mknodat(t, dirfd, addr, mode, dev)
+ return 0, nil, mknodat(t, dirfd, addr, linux.FileMode(mode), dev)
}
-func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint, dev uint32) error {
+func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode linux.FileMode, dev uint32) error {
path, err := copyInPath(t, addr)
if err != nil {
return err
@@ -127,10 +127,15 @@ func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint, dev uint
if err != nil {
return err
}
- defer tpop.Release()
+ defer tpop.Release(t)
+
+ // "Zero file type is equivalent to type S_IFREG." - mknod(2)
+ if mode.FileType() == 0 {
+ mode |= linux.ModeRegular
+ }
major, minor := linux.DecodeDeviceID(dev)
return t.Kernel().VFS().MknodAt(t, t.Credentials(), &tpop.pop, &vfs.MknodOptions{
- Mode: linux.FileMode(mode &^ t.FSContext().Umask()),
+ Mode: mode &^ linux.FileMode(t.FSContext().Umask()),
DevMajor: uint32(major),
DevMinor: minor,
})
@@ -169,7 +174,7 @@ func openat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, flags uint32, mo
if err != nil {
return 0, nil, err
}
- defer tpop.Release()
+ defer tpop.Release(t)
file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &tpop.pop, &vfs.OpenOptions{
Flags: flags | linux.O_LARGEFILE,
@@ -178,7 +183,7 @@ func openat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, flags uint32, mo
if err != nil {
return 0, nil, err
}
- defer file.DecRef()
+ defer file.DecRef(t)
fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
CloseOnExec: flags&linux.O_CLOEXEC != 0,
@@ -222,7 +227,7 @@ func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd
if err != nil {
return err
}
- defer oldtpop.Release()
+ defer oldtpop.Release(t)
newpath, err := copyInPath(t, newpathAddr)
if err != nil {
@@ -232,7 +237,7 @@ func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd
if err != nil {
return err
}
- defer newtpop.Release()
+ defer newtpop.Release(t)
return t.Kernel().VFS().RenameAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop, &vfs.RenameOptions{
Flags: flags,
@@ -254,7 +259,7 @@ func rmdirat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error {
if err != nil {
return err
}
- defer tpop.Release()
+ defer tpop.Release(t)
return t.Kernel().VFS().RmdirAt(t, t.Credentials(), &tpop.pop)
}
@@ -273,7 +278,7 @@ func unlinkat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error {
if err != nil {
return err
}
- defer tpop.Release()
+ defer tpop.Release(t)
return t.Kernel().VFS().UnlinkAt(t, t.Credentials(), &tpop.pop)
}
@@ -313,6 +318,9 @@ func symlinkat(t *kernel.Task, targetAddr usermem.Addr, newdirfd int32, linkpath
if err != nil {
return err
}
+ if len(target) == 0 {
+ return syserror.ENOENT
+ }
linkpath, err := copyInPath(t, linkpathAddr)
if err != nil {
return err
@@ -321,6 +329,6 @@ func symlinkat(t *kernel.Task, targetAddr usermem.Addr, newdirfd int32, linkpath
if err != nil {
return err
}
- defer tpop.Release()
+ defer tpop.Release(t)
return t.Kernel().VFS().SymlinkAt(t, t.Credentials(), &tpop.pop, target)
}
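
mknodat above now defaults a zero file type to S_IFREG before applying the umask, per the quoted mknod(2) rule. A standalone sketch of that normalization (normalizeMknodMode and the octal constants are local to the sketch):

package main

import "fmt"

const (
	fileTypeMask = 0170000 // S_IFMT
	modeRegular  = 0100000 // S_IFREG
)

// normalizeMknodMode defaults an untyped mode to a regular file, then strips
// the umask from the permission bits, as the diff does.
func normalizeMknodMode(mode, umask uint32) uint32 {
	if mode&fileTypeMask == 0 {
		mode |= modeRegular
	}
	return mode &^ umask
}

func main() {
	fmt.Printf("%o\n", normalizeMknodMode(0644, 022))    // 100644: regular file by default
	fmt.Printf("%o\n", normalizeMknodMode(0010644, 022)) // 10644: explicit FIFO type kept
}
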
diff --git a/pkg/sentry/syscalls/linux/vfs2/fscontext.go b/pkg/sentry/syscalls/linux/vfs2/fscontext.go
index 317409a18..a7d4d2a36 100644
--- a/pkg/sentry/syscalls/linux/vfs2/fscontext.go
+++ b/pkg/sentry/syscalls/linux/vfs2/fscontext.go
@@ -31,8 +31,8 @@ func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
root := t.FSContext().RootDirectoryVFS2()
wd := t.FSContext().WorkingDirectoryVFS2()
s, err := t.Kernel().VFS().PathnameForGetcwd(t, root, wd)
- root.DecRef()
- wd.DecRef()
+ root.DecRef(t)
+ wd.DecRef(t)
if err != nil {
return 0, nil, err
}
@@ -67,7 +67,7 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if err != nil {
return 0, nil, err
}
- defer tpop.Release()
+ defer tpop.Release(t)
vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
CheckSearchable: true,
@@ -75,8 +75,8 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if err != nil {
return 0, nil, err
}
- t.FSContext().SetWorkingDirectoryVFS2(vd)
- vd.DecRef()
+ t.FSContext().SetWorkingDirectoryVFS2(t, vd)
+ vd.DecRef(t)
return 0, nil, nil
}
@@ -88,7 +88,7 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if err != nil {
return 0, nil, err
}
- defer tpop.Release()
+ defer tpop.Release(t)
vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
CheckSearchable: true,
@@ -96,8 +96,8 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if err != nil {
return 0, nil, err
}
- t.FSContext().SetWorkingDirectoryVFS2(vd)
- vd.DecRef()
+ t.FSContext().SetWorkingDirectoryVFS2(t, vd)
+ vd.DecRef(t)
return 0, nil, nil
}
@@ -117,7 +117,7 @@ func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if err != nil {
return 0, nil, err
}
- defer tpop.Release()
+ defer tpop.Release(t)
vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
CheckSearchable: true,
@@ -125,7 +125,7 @@ func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if err != nil {
return 0, nil, err
}
- t.FSContext().SetRootDirectoryVFS2(vd)
- vd.DecRef()
+ t.FSContext().SetRootDirectoryVFS2(t, vd)
+ vd.DecRef(t)
return 0, nil, nil
}
diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go
index c7c7bf7ce..5517595b5 100644
--- a/pkg/sentry/syscalls/linux/vfs2/getdents.go
+++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go
@@ -44,7 +44,7 @@ func getdents(t *kernel.Task, args arch.SyscallArguments, isGetdents64 bool) (ui
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
cb := getGetdentsCallback(t, addr, size, isGetdents64)
err := file.IterDirents(t, cb)
diff --git a/pkg/sentry/syscalls/linux/vfs2/inotify.go b/pkg/sentry/syscalls/linux/vfs2/inotify.go
index 7d50b6a16..11753d8e5 100644
--- a/pkg/sentry/syscalls/linux/vfs2/inotify.go
+++ b/pkg/sentry/syscalls/linux/vfs2/inotify.go
@@ -35,7 +35,7 @@ func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
if err != nil {
return 0, nil, err
}
- defer ino.DecRef()
+ defer ino.DecRef(t)
fd, err := t.NewFDFromVFS2(0, ino, kernel.FDFlags{
CloseOnExec: flags&linux.IN_CLOEXEC != 0,
@@ -66,7 +66,7 @@ func fdToInotify(t *kernel.Task, fd int32) (*vfs.Inotify, *vfs.FileDescription,
ino, ok := f.Impl().(*vfs.Inotify)
if !ok {
// Not an inotify fd.
- f.DecRef()
+ f.DecRef(t)
return nil, nil, syserror.EINVAL
}
@@ -81,7 +81,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern
// "EINVAL: The given event mask contains no valid events."
// -- inotify_add_watch(2)
- if validBits := mask & linux.ALL_INOTIFY_BITS; validBits == 0 {
+ if mask&linux.ALL_INOTIFY_BITS == 0 {
return 0, nil, syserror.EINVAL
}
@@ -96,7 +96,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern
if err != nil {
return 0, nil, err
}
- defer f.DecRef()
+ defer f.DecRef(t)
path, err := copyInPath(t, addr)
if err != nil {
@@ -109,15 +109,18 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern
if err != nil {
return 0, nil, err
}
- defer tpop.Release()
+ defer tpop.Release(t)
d, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{})
if err != nil {
return 0, nil, err
}
- defer d.DecRef()
+ defer d.DecRef(t)
- fd = ino.AddWatch(d.Dentry(), mask)
- return uintptr(fd), nil, err
+ fd, err = ino.AddWatch(d.Dentry(), mask)
+ if err != nil {
+ return 0, nil, err
+ }
+ return uintptr(fd), nil, nil
}
// InotifyRmWatch implements the inotify_rm_watch() syscall.
@@ -129,6 +132,6 @@ func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
if err != nil {
return 0, nil, err
}
- defer f.DecRef()
- return 0, nil, ino.RmWatch(wd)
+ defer f.DecRef(t)
+ return 0, nil, ino.RmWatch(t, wd)
}
diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
index 5a2418da9..2806c3f6f 100644
--- a/pkg/sentry/syscalls/linux/vfs2/ioctl.go
+++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
@@ -15,6 +15,8 @@
package vfs2
import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/syserror"
@@ -28,7 +30,78 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
+
+ // Handle ioctls that apply to all FDs.
+ switch args[1].Int() {
+ case linux.FIONCLEX:
+ t.FDTable().SetFlagsVFS2(t, fd, kernel.FDFlags{
+ CloseOnExec: false,
+ })
+ return 0, nil, nil
+
+ case linux.FIOCLEX:
+ t.FDTable().SetFlagsVFS2(t, fd, kernel.FDFlags{
+ CloseOnExec: true,
+ })
+ return 0, nil, nil
+
+ case linux.FIONBIO:
+ var set int32
+ if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
+ return 0, nil, err
+ }
+ flags := file.StatusFlags()
+ if set != 0 {
+ flags |= linux.O_NONBLOCK
+ } else {
+ flags &^= linux.O_NONBLOCK
+ }
+ return 0, nil, file.SetStatusFlags(t, t.Credentials(), flags)
+
+ case linux.FIOASYNC:
+ var set int32
+ if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
+ return 0, nil, err
+ }
+ flags := file.StatusFlags()
+ if set != 0 {
+ flags |= linux.O_ASYNC
+ } else {
+ flags &^= linux.O_ASYNC
+ }
+ file.SetStatusFlags(t, t.Credentials(), flags)
+ return 0, nil, nil
+
+ case linux.FIOGETOWN, linux.SIOCGPGRP:
+ var who int32
+ owner, hasOwner := getAsyncOwner(t, file)
+ if hasOwner {
+ if owner.Type == linux.F_OWNER_PGRP {
+ who = -owner.PID
+ } else {
+ who = owner.PID
+ }
+ }
+ _, err := primitive.CopyInt32Out(t, args[2].Pointer(), who)
+ return 0, nil, err
+
+ case linux.FIOSETOWN, linux.SIOCSPGRP:
+ var who int32
+ if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &who); err != nil {
+ return 0, nil, err
+ }
+ ownerType := int32(linux.F_OWNER_PID)
+ if who < 0 {
+ // Check for overflow before flipping the sign.
+ if who-1 > who {
+ return 0, nil, syserror.EINVAL
+ }
+ ownerType = linux.F_OWNER_PGRP
+ who = -who
+ }
+ return 0, nil, setAsyncOwner(t, file, ownerType, who)
+ }
ret, err := file.Ioctl(t, t.MemoryManager(), args)
return ret, nil, err
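
Ioctl now handles the generic FD ioctls before dispatching to the file implementation; FIONBIO and FIOASYNC simply toggle one status flag based on whether the copied-in int is non-zero. A minimal sketch of the FIONBIO case (applyFIONBIO is illustrative; 0x800 is O_NONBLOCK on amd64):

package main

import "fmt"

const oNONBLOCK = 0x800 // O_NONBLOCK on linux/amd64

// applyFIONBIO mirrors the FIONBIO handling above: any non-zero argument sets
// O_NONBLOCK, zero clears it, and all other status flags are left untouched.
func applyFIONBIO(flags uint32, set int32) uint32 {
	if set != 0 {
		return flags | oNONBLOCK
	}
	return flags &^ oNONBLOCK
}

func main() {
	flags := applyFIONBIO(0, 1)
	fmt.Printf("%#x\n", flags) // 0x800
	flags = applyFIONBIO(flags, 0)
	fmt.Printf("%#x\n", flags) // 0x0
}
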
diff --git a/pkg/sentry/syscalls/linux/vfs2/lock.go b/pkg/sentry/syscalls/linux/vfs2/lock.go
new file mode 100644
index 000000000..b910b5a74
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/lock.go
@@ -0,0 +1,64 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Flock implements linux syscall flock(2).
+func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ operation := args[1].Int()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ // flock(2): EBADF fd is not an open file descriptor.
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef(t)
+
+ nonblocking := operation&linux.LOCK_NB != 0
+ operation &^= linux.LOCK_NB
+
+ var blocker lock.Blocker
+ if !nonblocking {
+ blocker = t
+ }
+
+ switch operation {
+ case linux.LOCK_EX:
+ if err := file.LockBSD(t, lock.WriteLock, blocker); err != nil {
+ return 0, nil, err
+ }
+ case linux.LOCK_SH:
+ if err := file.LockBSD(t, lock.ReadLock, blocker); err != nil {
+ return 0, nil, err
+ }
+ case linux.LOCK_UN:
+ if err := file.UnlockBSD(t); err != nil {
+ return 0, nil, err
+ }
+ default:
+ // flock(2): EINVAL operation is invalid.
+ return 0, nil, syserror.EINVAL
+ }
+
+ return 0, nil, nil
+}
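
flock(2) as implemented above can be exercised from an application with the standard library alone; LOCK_NB corresponds to the branch where no blocker is passed to LockBSD. A small Linux-only usage example (the path /tmp/flock-demo is arbitrary):

package main

import (
	"fmt"
	"os"
	"syscall"
)

func main() {
	f, err := os.Create("/tmp/flock-demo")
	if err != nil {
		fmt.Println(err)
		return
	}
	defer f.Close()
	fd := int(f.Fd())

	// LOCK_EX|LOCK_NB: take an exclusive lock, but fail with EWOULDBLOCK
	// instead of sleeping if another holder exists.
	if err := syscall.Flock(fd, syscall.LOCK_EX|syscall.LOCK_NB); err != nil {
		fmt.Println("flock:", err)
		return
	}
	fmt.Println("locked")

	// LOCK_UN releases the lock (the UnlockBSD path above).
	if err := syscall.Flock(fd, syscall.LOCK_UN); err != nil {
		fmt.Println("unlock:", err)
		return
	}
	fmt.Println("unlocked")
}
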
diff --git a/pkg/sentry/syscalls/linux/vfs2/memfd.go b/pkg/sentry/syscalls/linux/vfs2/memfd.go
index bbe248d17..c4c0f9e0a 100644
--- a/pkg/sentry/syscalls/linux/vfs2/memfd.go
+++ b/pkg/sentry/syscalls/linux/vfs2/memfd.go
@@ -47,10 +47,11 @@ func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
}
shmMount := t.Kernel().ShmMount()
- file, err := tmpfs.NewMemfd(shmMount, t.Credentials(), allowSeals, memfdPrefix+name)
+ file, err := tmpfs.NewMemfd(t, t.Credentials(), shmMount, allowSeals, memfdPrefix+name)
if err != nil {
return 0, nil, err
}
+ defer file.DecRef(t)
fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
CloseOnExec: cloExec,
diff --git a/pkg/sentry/syscalls/linux/vfs2/mmap.go b/pkg/sentry/syscalls/linux/vfs2/mmap.go
index 60a43f0a0..9d9dbf775 100644
--- a/pkg/sentry/syscalls/linux/vfs2/mmap.go
+++ b/pkg/sentry/syscalls/linux/vfs2/mmap.go
@@ -17,6 +17,7 @@ package vfs2
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/syserror"
@@ -61,7 +62,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
}
defer func() {
if opts.MappingIdentity != nil {
- opts.MappingIdentity.DecRef()
+ opts.MappingIdentity.DecRef(t)
}
}()
@@ -71,7 +72,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// mmap unconditionally requires that the FD is readable.
if !file.IsReadable() {
@@ -85,6 +86,17 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if err := file.ConfigureMMap(t, &opts); err != nil {
return 0, nil, err
}
+ } else if shared {
+ // Back shared anonymous mappings with an anonymous tmpfs file.
+ opts.Offset = 0
+ file, err := tmpfs.NewZeroFile(t, t.Credentials(), t.Kernel().ShmMount(), opts.Length)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer file.DecRef(t)
+ if err := file.ConfigureMMap(t, &opts); err != nil {
+ return 0, nil, err
+ }
}
rv, err := t.MemoryManager().MMap(t, opts)
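
The new branch in Mmap backs MAP_SHARED|MAP_ANONYMOUS mappings with an anonymous tmpfs file; from an application's point of view nothing changes. The following Linux-only snippet simply demonstrates the call shape that now takes this path (fd is -1 and the kernel-side offset is forced to 0):

package main

import (
	"fmt"
	"syscall"
)

func main() {
	mem, err := syscall.Mmap(-1, 0, 4096,
		syscall.PROT_READ|syscall.PROT_WRITE,
		syscall.MAP_SHARED|syscall.MAP_ANONYMOUS)
	if err != nil {
		fmt.Println("mmap:", err)
		return
	}
	defer syscall.Munmap(mem)

	mem[0] = 0x42 // the mapping is usable like any other memory
	fmt.Printf("%#x\n", mem[0])
}
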
diff --git a/pkg/sentry/syscalls/linux/vfs2/mount.go b/pkg/sentry/syscalls/linux/vfs2/mount.go
new file mode 100644
index 000000000..769c9b92f
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/mount.go
@@ -0,0 +1,150 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Mount implements Linux syscall mount(2).
+func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ sourceAddr := args[0].Pointer()
+ targetAddr := args[1].Pointer()
+ typeAddr := args[2].Pointer()
+ flags := args[3].Uint64()
+ dataAddr := args[4].Pointer()
+
+ // For null-terminated strings related to mount(2), Linux copies in at most
+ // a page worth of data. See fs/namespace.c:copy_mount_string().
+ fsType, err := t.CopyInString(typeAddr, usermem.PageSize)
+ if err != nil {
+ return 0, nil, err
+ }
+ source, err := t.CopyInString(sourceAddr, usermem.PageSize)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ targetPath, err := copyInPath(t, targetAddr)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ data := ""
+ if dataAddr != 0 {
+ // In Linux, a full page is always copied in regardless of null
+ // character placement, and the address is passed to each file system.
+ // Most file systems always treat this data as a string, though, and so
+ // do all of the ones we implement.
+ data, err = t.CopyInString(dataAddr, usermem.PageSize)
+ if err != nil {
+ return 0, nil, err
+ }
+ }
+
+ // Ignore magic value that was required before Linux 2.4.
+ if flags&linux.MS_MGC_MSK == linux.MS_MGC_VAL {
+ flags = flags &^ linux.MS_MGC_MSK
+ }
+
+ // Must have CAP_SYS_ADMIN in the current mount namespace's associated user
+ // namespace.
+ creds := t.Credentials()
+ if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) {
+ return 0, nil, syserror.EPERM
+ }
+
+ const unsupportedOps = linux.MS_REMOUNT | linux.MS_BIND |
+ linux.MS_SHARED | linux.MS_PRIVATE | linux.MS_SLAVE |
+ linux.MS_UNBINDABLE | linux.MS_MOVE
+
+ // Silently allow MS_NOSUID, since we don't implement set-id bits
+ // anyway.
+ const unsupportedFlags = linux.MS_NODIRATIME | linux.MS_STRICTATIME
+
+ // Linux just allows passing any flags to mount(2) - it won't fail when
+ // unknown or unsupported flags are passed. Since we don't implement
+ // everything, we fail explicitly on flags that are unimplemented.
+ if flags&(unsupportedOps|unsupportedFlags) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ var opts vfs.MountOptions
+ if flags&linux.MS_NOATIME == linux.MS_NOATIME {
+ opts.Flags.NoATime = true
+ }
+ if flags&linux.MS_NOEXEC == linux.MS_NOEXEC {
+ opts.Flags.NoExec = true
+ }
+ if flags&linux.MS_NODEV == linux.MS_NODEV {
+ opts.Flags.NoDev = true
+ }
+ if flags&linux.MS_NOSUID == linux.MS_NOSUID {
+ opts.Flags.NoSUID = true
+ }
+ if flags&linux.MS_RDONLY == linux.MS_RDONLY {
+ opts.ReadOnly = true
+ }
+ opts.GetFilesystemOptions.Data = data
+
+ target, err := getTaskPathOperation(t, linux.AT_FDCWD, targetPath, disallowEmptyPath, nofollowFinalSymlink)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer target.Release(t)
+ _, err = t.Kernel().VFS().MountAt(t, creds, source, &target.pop, fsType, &opts)
+ return 0, nil, err
+}
+
+// Umount2 implements Linux syscall umount2(2).
+func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ flags := args[1].Int()
+
+ // Must have CAP_SYS_ADMIN in the mount namespace's associated user
+ // namespace.
+ //
+ // Currently, this is always the init task's user namespace.
+ creds := t.Credentials()
+ if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) {
+ return 0, nil, syserror.EPERM
+ }
+
+ const unsupported = linux.MNT_FORCE | linux.MNT_EXPIRE
+ if flags&unsupported != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ path, err := copyInPath(t, addr)
+ if err != nil {
+ return 0, nil, err
+ }
+ tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, nofollowFinalSymlink)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer tpop.Release(t)
+
+ opts := vfs.UmountOptions{
+ Flags: uint32(flags),
+ }
+
+ return 0, nil, t.Kernel().VFS().UmountAt(t, creds, &tpop.pop, &opts)
+}
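
Mount translates each supported MS_* bit into one field of vfs.MountOptions after rejecting everything it does not implement. A standalone sketch of that translation (mountFlags and translateMountFlags are local stand-ins; the MS_* values are the standard Linux ones):

package main

import "fmt"

// Linux mount flag values from <sys/mount.h>; only the bits translated above
// are needed here.
const (
	msRdonly  = 0x1
	msNosuid  = 0x2
	msNodev   = 0x4
	msNoexec  = 0x8
	msNoatime = 0x400
)

type mountFlags struct {
	ReadOnly, NoATime, NoExec, NoDev, NoSUID bool
}

// translateMountFlags maps each supported MS_* bit to one field, assuming
// unsupported bits were already rejected, as in Mount above.
func translateMountFlags(flags uint64) mountFlags {
	return mountFlags{
		ReadOnly: flags&msRdonly != 0,
		NoATime:  flags&msNoatime != 0,
		NoExec:   flags&msNoexec != 0,
		NoDev:    flags&msNodev != 0,
		NoSUID:   flags&msNosuid != 0,
	}
}

func main() {
	fmt.Printf("%+v\n", translateMountFlags(msRdonly|msNoexec))
}
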
diff --git a/pkg/sentry/syscalls/linux/vfs2/path.go b/pkg/sentry/syscalls/linux/vfs2/path.go
index 97da6c647..90a511d9a 100644
--- a/pkg/sentry/syscalls/linux/vfs2/path.go
+++ b/pkg/sentry/syscalls/linux/vfs2/path.go
@@ -42,7 +42,7 @@ func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldA
haveStartRef := false
if !path.Absolute {
if !path.HasComponents() && !bool(shouldAllowEmptyPath) {
- root.DecRef()
+ root.DecRef(t)
return taskPathOperation{}, syserror.ENOENT
}
if dirfd == linux.AT_FDCWD {
@@ -51,13 +51,13 @@ func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldA
} else {
dirfile := t.GetFileVFS2(dirfd)
if dirfile == nil {
- root.DecRef()
+ root.DecRef(t)
return taskPathOperation{}, syserror.EBADF
}
start = dirfile.VirtualDentry()
start.IncRef()
haveStartRef = true
- dirfile.DecRef()
+ dirfile.DecRef(t)
}
}
return taskPathOperation{
@@ -71,10 +71,10 @@ func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldA
}, nil
}
-func (tpop *taskPathOperation) Release() {
- tpop.pop.Root.DecRef()
+func (tpop *taskPathOperation) Release(t *kernel.Task) {
+ tpop.pop.Root.DecRef(t)
if tpop.haveStartRef {
- tpop.pop.Start.DecRef()
+ tpop.pop.Start.DecRef(t)
tpop.haveStartRef = false
}
}
diff --git a/pkg/sentry/syscalls/linux/vfs2/pipe.go b/pkg/sentry/syscalls/linux/vfs2/pipe.go
index 4a01e4209..ee38fdca0 100644
--- a/pkg/sentry/syscalls/linux/vfs2/pipe.go
+++ b/pkg/sentry/syscalls/linux/vfs2/pipe.go
@@ -16,6 +16,7 @@ package vfs2
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -42,8 +43,8 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags int32) error {
return syserror.EINVAL
}
r, w := pipefs.NewConnectedPipeFDs(t, t.Kernel().PipeMount(), uint32(flags&linux.O_NONBLOCK))
- defer r.DecRef()
- defer w.DecRef()
+ defer r.DecRef(t)
+ defer w.DecRef(t)
fds, err := t.NewFDsVFS2(0, []*vfs.FileDescription{r, w}, kernel.FDFlags{
CloseOnExec: flags&linux.O_CLOEXEC != 0,
@@ -51,10 +52,10 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags int32) error {
if err != nil {
return err
}
- if _, err := t.CopyOut(addr, fds); err != nil {
+ if _, err := primitive.CopyInt32SliceOut(t, addr, fds); err != nil {
for _, fd := range fds {
- if _, file := t.FDTable().Remove(fd); file != nil {
- file.DecRef()
+ if _, file := t.FDTable().Remove(t, fd); file != nil {
+ file.DecRef(t)
}
}
return err
diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go
index ff1b25d7b..c22e4ce54 100644
--- a/pkg/sentry/syscalls/linux/vfs2/poll.go
+++ b/pkg/sentry/syscalls/linux/vfs2/poll.go
@@ -73,7 +73,7 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan
}
if ch == nil {
- defer file.DecRef()
+ defer file.DecRef(t)
} else {
state.file = file
state.waiter, _ = waiter.NewChannelEntry(ch)
@@ -85,11 +85,11 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan
}
// releaseState releases all the pollState in "state".
-func releaseState(state []pollState) {
+func releaseState(t *kernel.Task, state []pollState) {
for i := range state {
if state[i].file != nil {
state[i].file.EventUnregister(&state[i].waiter)
- state[i].file.DecRef()
+ state[i].file.DecRef(t)
}
}
}
@@ -110,7 +110,7 @@ func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time.
// result, we stop registering for events but still go through all files
// to get their ready masks.
state := make([]pollState, len(pfd))
- defer releaseState(state)
+ defer releaseState(t, state)
n := uintptr(0)
for i := range pfd {
initReadiness(t, &pfd[i], &state[i], ch)
@@ -165,7 +165,7 @@ func copyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD
pfd := make([]linux.PollFD, nfds)
if nfds > 0 {
- if _, err := t.CopyIn(addr, &pfd); err != nil {
+ if _, err := linux.CopyPollFDSliceIn(t, addr, pfd); err != nil {
return nil, err
}
}
@@ -192,7 +192,7 @@ func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration)
// The poll entries are copied out regardless of whether
// any are set or not. This aligns with the Linux behavior.
if nfds > 0 && err == nil {
- if _, err := t.CopyOut(addr, pfd); err != nil {
+ if _, err := linux.CopyPollFDSliceOut(t, addr, pfd); err != nil {
return remainingTimeout, 0, err
}
}
@@ -205,7 +205,7 @@ func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialBy
set := make([]byte, nBytes)
if addr != 0 {
- if _, err := t.CopyIn(addr, &set); err != nil {
+ if _, err := t.CopyInBytes(addr, set); err != nil {
return nil, err
}
// If we only use part of the last byte, mask out the extraneous bits.
@@ -269,7 +269,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add
if file == nil {
return 0, syserror.EBADF
}
- file.DecRef()
+ file.DecRef(t)
var mask int16
if (rV & m) != 0 {
@@ -332,19 +332,19 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add
// Copy updated vectors back.
if readFDs != 0 {
- if _, err := t.CopyOut(readFDs, r); err != nil {
+ if _, err := t.CopyOutBytes(readFDs, r); err != nil {
return 0, err
}
}
if writeFDs != 0 {
- if _, err := t.CopyOut(writeFDs, w); err != nil {
+ if _, err := t.CopyOutBytes(writeFDs, w); err != nil {
return 0, err
}
}
if exceptFDs != 0 {
- if _, err := t.CopyOut(exceptFDs, e); err != nil {
+ if _, err := t.CopyOutBytes(exceptFDs, e); err != nil {
return 0, err
}
}
@@ -415,7 +415,7 @@ func poll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration
nfds: nfds,
timeout: remainingTimeout,
})
- return 0, kernel.ERESTART_RESTARTBLOCK
+ return 0, syserror.ERESTART_RESTARTBLOCK
}
return n, err
}
@@ -462,7 +462,7 @@ func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Note that this means that if err is nil but copyErr is not, copyErr is
// ignored. This is consistent with Linux.
if err == syserror.EINTR && copyErr == nil {
- err = kernel.ERESTARTNOHAND
+ err = syserror.ERESTARTNOHAND
}
return n, nil, err
}
@@ -492,11 +492,17 @@ func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr)
// See comment in Ppoll.
if err == syserror.EINTR && copyErr == nil {
- err = kernel.ERESTARTNOHAND
+ err = syserror.ERESTARTNOHAND
}
return n, nil, err
}
+// +marshal
+type sigSetWithSize struct {
+ sigsetAddr uint64
+ sizeofSigset uint64
+}
+
// Pselect implements linux syscall pselect(2).
func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
nfds := int(args[0].Int()) // select(2) uses an int.
@@ -533,17 +539,11 @@ func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)
// See comment in Ppoll.
if err == syserror.EINTR && copyErr == nil {
- err = kernel.ERESTARTNOHAND
+ err = syserror.ERESTARTNOHAND
}
return n, nil, err
}
-// +marshal
-type sigSetWithSize struct {
- sigsetAddr uint64
- sizeofSigset uint64
-}
-
// copyTimespecInToDuration copies a Timespec from the untrusted app range,
// validates it and converts it to a Duration.
//
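
doSelect's CopyInFDSet masks out the bits of the final byte that lie beyond nfds, so stale bits in the caller's buffer cannot be misread as set descriptors. A standalone sketch of that masking (maskPartialByte is a name invented here; the real code clears the same bits with a shift in the other direction):

package main

import "fmt"

// maskPartialByte clears the bits of the last fd_set byte beyond nfds: if
// nfds is not a multiple of 8, only the low nBitsInLastPartialByte bits of
// the final byte belong to the set.
func maskPartialByte(set []byte, nBitsInLastPartialByte int) {
	if nBitsInLastPartialByte != 0 {
		set[len(set)-1] &= ^byte(0) >> (8 - nBitsInLastPartialByte)
	}
}

func main() {
	set := []byte{0xff, 0xff} // 16 bits copied in, but nfds == 11
	maskPartialByte(set, 11%8)
	fmt.Printf("%08b %08b\n", set[0], set[1]) // 11111111 00000111
}
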
diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go
index 7f9debd4a..b77b29dcc 100644
--- a/pkg/sentry/syscalls/linux/vfs2/read_write.go
+++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go
@@ -44,7 +44,7 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the size is legitimate.
si := int(size)
@@ -62,7 +62,7 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
n, err := read(t, file, dst, vfs.ReadOptions{})
t.IOUsage().AccountReadSyscall(n)
- return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file)
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "read", file)
}
// Readv implements Linux syscall readv(2).
@@ -75,7 +75,7 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Get the destination of the read.
dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
@@ -87,14 +87,14 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
n, err := read(t, file, dst, vfs.ReadOptions{})
t.IOUsage().AccountReadSyscall(n)
- return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "readv", file)
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "readv", file)
}
func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
n, err := file.Read(t, dst, opts)
if err != syserror.ErrWouldBlock {
if n > 0 {
- file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+ file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent)
}
return n, err
}
@@ -102,7 +102,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt
allowBlock, deadline, hasDeadline := blockPolicy(t, file)
if !allowBlock {
if n > 0 {
- file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+ file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent)
}
return n, err
}
@@ -135,7 +135,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt
file.EventUnregister(&w)
if total > 0 {
- file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+ file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent)
}
return total, err
}
@@ -151,7 +151,7 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the offset is legitimate and does not overflow.
if offset < 0 || offset+int64(size) < 0 {
@@ -174,7 +174,7 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
n, err := pread(t, file, dst, offset, vfs.ReadOptions{})
t.IOUsage().AccountReadSyscall(n)
- return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pread64", file)
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pread64", file)
}
// Preadv implements Linux syscall preadv(2).
@@ -188,7 +188,7 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the offset is legitimate.
if offset < 0 {
@@ -205,7 +205,7 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
n, err := pread(t, file, dst, offset, vfs.ReadOptions{})
t.IOUsage().AccountReadSyscall(n)
- return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv", file)
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "preadv", file)
}
// Preadv2 implements Linux syscall preadv2(2).
@@ -226,7 +226,7 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the offset is legitimate.
if offset < -1 {
@@ -251,14 +251,14 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
n, err = pread(t, file, dst, offset, opts)
}
t.IOUsage().AccountReadSyscall(n)
- return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file)
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "preadv2", file)
}
func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
n, err := file.PRead(t, dst, offset, opts)
if err != syserror.ErrWouldBlock {
if n > 0 {
- file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+ file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent)
}
return n, err
}
@@ -266,7 +266,7 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of
allowBlock, deadline, hasDeadline := blockPolicy(t, file)
if !allowBlock {
if n > 0 {
- file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+ file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent)
}
return n, err
}
@@ -299,7 +299,7 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of
file.EventUnregister(&w)
if total > 0 {
- file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+ file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent)
}
return total, err
}
@@ -314,7 +314,7 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the size is legitimate.
si := int(size)
@@ -332,7 +332,7 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
n, err := write(t, file, src, vfs.WriteOptions{})
t.IOUsage().AccountWriteSyscall(n)
- return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "write", file)
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "write", file)
}
// Writev implements Linux syscall writev(2).
@@ -345,7 +345,7 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Get the source of the write.
src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
@@ -357,14 +357,14 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
n, err := write(t, file, src, vfs.WriteOptions{})
t.IOUsage().AccountWriteSyscall(n)
- return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "writev", file)
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "writev", file)
}
func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
n, err := file.Write(t, src, opts)
if err != syserror.ErrWouldBlock {
if n > 0 {
- file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+ file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
}
return n, err
}
@@ -372,7 +372,7 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op
allowBlock, deadline, hasDeadline := blockPolicy(t, file)
if !allowBlock {
if n > 0 {
- file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+ file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
}
return n, err
}
@@ -405,7 +405,7 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op
file.EventUnregister(&w)
if total > 0 {
- file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+ file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
}
return total, err
}
@@ -421,7 +421,7 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the offset is legitimate and does not overflow.
if offset < 0 || offset+int64(size) < 0 {
@@ -444,7 +444,7 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
n, err := pwrite(t, file, src, offset, vfs.WriteOptions{})
t.IOUsage().AccountWriteSyscall(n)
- return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwrite64", file)
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pwrite64", file)
}
// Pwritev implements Linux syscall pwritev(2).
@@ -458,7 +458,7 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the offset is legitimate.
if offset < 0 {
@@ -475,7 +475,7 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
n, err := pwrite(t, file, src, offset, vfs.WriteOptions{})
t.IOUsage().AccountReadSyscall(n)
- return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev", file)
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pwritev", file)
}
// Pwritev2 implements Linux syscall pwritev2(2).
@@ -496,7 +496,7 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the offset is legitimate.
if offset < -1 {
@@ -521,14 +521,14 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
n, err = pwrite(t, file, src, offset, opts)
}
t.IOUsage().AccountWriteSyscall(n)
- return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file)
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pwritev2", file)
}
func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
n, err := file.PWrite(t, src, offset, opts)
if err != syserror.ErrWouldBlock {
if n > 0 {
- file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+ file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
}
return n, err
}
@@ -536,7 +536,7 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o
allowBlock, deadline, hasDeadline := blockPolicy(t, file)
if !allowBlock {
if n > 0 {
- file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+ file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
}
return n, err
}
@@ -569,7 +569,7 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o
file.EventUnregister(&w)
if total > 0 {
- file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+ file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
}
return total, err
}
@@ -601,8 +601,41 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
newoff, err := file.Seek(t, offset, whence)
return uintptr(newoff), nil, err
}
+
+// Readahead implements readahead(2).
+func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ offset := args[1].Int64()
+ size := args[2].SizeT()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef(t)
+
+ // Check that the file is readable.
+ if !file.IsReadable() {
+ return 0, nil, syserror.EBADF
+ }
+
+ // Check that the size is valid.
+ if int(size) < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Check that the offset is legitimate and does not overflow.
+ if offset < 0 || offset+int64(size) < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Return EINVAL: Linux also returns EINVAL when the underlying file type
+ // does not support readahead, so callers cannot distinguish this stub from
+ // that case. In the future, we may extend this to support readahead hints.
+ return 0, nil, syserror.EINVAL
+}
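Since Readahead unconditionally reports EINVAL after validating its arguments, a caller cannot distinguish the stub from an unsupported file type. A small user-space sketch of the observable behavior, assuming golang.org/x/sys/unix (not part of this change):

	package main

	import (
		"fmt"
		"os"

		"golang.org/x/sys/unix"
	)

	func main() {
		f, err := os.Open("/etc/hostname")
		if err != nil {
			fmt.Println("open:", err)
			return
		}
		defer f.Close()
		// Native Linux typically succeeds for regular files; under the stub
		// above this returns EINVAL until readahead hints are supported.
		if err := unix.Readahead(int(f.Fd()), 0, 4096); err != nil {
			fmt.Println("readahead:", err)
		}
	}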
diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go
index 09ecfed26..1ee37e5a8 100644
--- a/pkg/sentry/syscalls/linux/vfs2/setstat.go
+++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go
@@ -20,6 +20,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -65,7 +66,7 @@ func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
return 0, nil, file.SetStat(t, vfs.SetStatOptions{
Stat: linux.Statx{
@@ -150,7 +151,7 @@ func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
var opts vfs.SetStatOptions
if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil {
@@ -178,6 +179,7 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
Mask: linux.STATX_SIZE,
Size: uint64(length),
},
+ NeedWritePerm: true,
})
return 0, nil, handleSetSizeError(t, err)
}
@@ -195,7 +197,11 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
+
+ if !file.IsWritable() {
+ return 0, nil, syserror.EINVAL
+ }
err := file.SetStat(t, vfs.SetStatOptions{
Stat: linux.Statx{
@@ -206,6 +212,56 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, handleSetSizeError(t, err)
}
+// Fallocate implements linux system call fallocate(2).
+func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ mode := args[1].Uint64()
+ offset := args[2].Int64()
+ length := args[3].Int64()
+
+ file := t.GetFileVFS2(fd)
+
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef(t)
+
+ if !file.IsWritable() {
+ return 0, nil, syserror.EBADF
+ }
+
+ if mode != 0 {
+ return 0, nil, syserror.ENOTSUP
+ }
+
+ if offset < 0 || length <= 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ size := offset + length
+
+ if size < 0 {
+ return 0, nil, syserror.EFBIG
+ }
+
+ limit := limits.FromContext(t).Get(limits.FileSize).Cur
+
+ if uint64(size) >= limit {
+ t.SendSignal(&arch.SignalInfo{
+ Signo: int32(linux.SIGXFSZ),
+ Code: arch.SignalInfoUser,
+ })
+ return 0, nil, syserror.EFBIG
+ }
+
+ if err := file.Allocate(t, mode, uint64(offset), uint64(length)); err != nil {
+ return 0, nil, err
+ }
+
+ file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
+ return 0, nil, nil
+}
+
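A quick user-space sketch of the fallocate(2) paths implemented above, assuming golang.org/x/sys/unix (errno names use the Linux spellings; ENOTSUP and EOPNOTSUPP share a value on Linux):

	// Sketch: exercise the three main branches of the implementation above.
	f, err := os.Create("/tmp/fallocate-demo")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// mode == 0 extends the file: the only mode supported above.
	fmt.Println(unix.Fallocate(int(f.Fd()), 0, 0, 4096)) // <nil>
	// Any nonzero mode is rejected with ENOTSUP.
	fmt.Println(unix.Fallocate(int(f.Fd()), unix.FALLOC_FL_KEEP_SIZE, 0, 4096))
	// A negative offset or non-positive length is rejected with EINVAL.
	fmt.Println(unix.Fallocate(int(f.Fd()), 0, -1, 4096))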
// Utime implements Linux syscall utime(2).
func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
pathAddr := args[0].Pointer()
@@ -290,7 +346,7 @@ func populateSetStatOptionsForUtimes(t *kernel.Task, timesAddr usermem.Addr, opt
return nil
}
var times [2]linux.Timeval
- if _, err := t.CopyIn(timesAddr, &times); err != nil {
+ if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil {
return err
}
if times[0].Usec < 0 || times[0].Usec > 999999 || times[1].Usec < 0 || times[1].Usec > 999999 {
@@ -354,7 +410,7 @@ func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, op
return nil
}
var times [2]linux.Timespec
- if _, err := t.CopyIn(timesAddr, &times); err != nil {
+ if _, err := linux.CopyTimespecSliceIn(t, timesAddr, times[:]); err != nil {
return err
}
if times[0].Nsec != linux.UTIME_OMIT {
@@ -382,7 +438,7 @@ func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, op
func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error {
root := t.FSContext().RootDirectoryVFS2()
- defer root.DecRef()
+ defer root.DecRef(t)
start := root
if !path.Absolute {
if !path.HasComponents() && !bool(shouldAllowEmptyPath) {
@@ -390,7 +446,7 @@ func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPa
}
if dirfd == linux.AT_FDCWD {
start = t.FSContext().WorkingDirectoryVFS2()
- defer start.DecRef()
+ defer start.DecRef(t)
} else {
dirfile := t.GetFileVFS2(dirfd)
if dirfile == nil {
@@ -401,13 +457,13 @@ func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPa
// VirtualFilesystem.SetStatAt(), since the former may be able
// to use opened file state to expedite the SetStat.
err := dirfile.SetStat(t, *opts)
- dirfile.DecRef()
+ dirfile.DecRef(t)
return err
}
start = dirfile.VirtualDentry()
start.IncRef()
- defer start.DecRef()
- dirfile.DecRef()
+ defer start.DecRef(t)
+ dirfile.DecRef(t)
}
}
return t.Kernel().VFS().SetStatAt(t, t.Credentials(), &vfs.PathOperation{
diff --git a/pkg/sentry/syscalls/linux/vfs2/signal.go b/pkg/sentry/syscalls/linux/vfs2/signal.go
index 623992f6f..b89f34cdb 100644
--- a/pkg/sentry/syscalls/linux/vfs2/signal.go
+++ b/pkg/sentry/syscalls/linux/vfs2/signal.go
@@ -45,7 +45,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize ui
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Is this a signalfd?
if sfd, ok := file.Impl().(*signalfd.SignalFileDescription); ok {
@@ -68,7 +68,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize ui
if err != nil {
return 0, nil, err
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Create a new descriptor.
fd, err = t.NewFDFromVFS2(0, file, kernel.FDFlags{
diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go
index 10b668477..7b33b3f59 100644
--- a/pkg/sentry/syscalls/linux/vfs2/socket.go
+++ b/pkg/sentry/syscalls/linux/vfs2/socket.go
@@ -19,6 +19,8 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -64,10 +66,10 @@ const flagsOffset = 48
const sizeOfInt32 = 4
// messageHeader64Len is the length of a MessageHeader64 struct.
-var messageHeader64Len = uint64(binary.Size(MessageHeader64{}))
+var messageHeader64Len = uint64((*MessageHeader64)(nil).SizeBytes())
// multipleMessageHeader64Len is the length of a multipleMessageHeader64 struct.
-var multipleMessageHeader64Len = uint64(binary.Size(multipleMessageHeader64{}))
+var multipleMessageHeader64Len = uint64((*multipleMessageHeader64)(nil).SizeBytes())
// baseRecvFlags are the flags that are accepted across recvmsg(2),
// recvmmsg(2), and recvfrom(2).
@@ -75,6 +77,8 @@ const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT |
// MessageHeader64 is the 64-bit representation of the msghdr struct used in
// the recvmsg and sendmsg syscalls.
+//
+// +marshal
type MessageHeader64 struct {
// Name is the optional pointer to a network address buffer.
Name uint64
@@ -103,30 +107,14 @@ type MessageHeader64 struct {
// multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in
// the recvmmsg and sendmmsg syscalls.
+//
+// +marshal
type multipleMessageHeader64 struct {
msgHdr MessageHeader64
msgLen uint32
_ int32
}
-// CopyInMessageHeader64 copies a message header from user to kernel memory.
-func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader64) error {
- b := t.CopyScratchBuffer(52)
- if _, err := t.CopyInBytes(addr, b); err != nil {
- return err
- }
-
- msg.Name = usermem.ByteOrder.Uint64(b[0:])
- msg.NameLen = usermem.ByteOrder.Uint32(b[8:])
- msg.Iov = usermem.ByteOrder.Uint64(b[16:])
- msg.IovLen = usermem.ByteOrder.Uint64(b[24:])
- msg.Control = usermem.ByteOrder.Uint64(b[32:])
- msg.ControlLen = usermem.ByteOrder.Uint64(b[40:])
- msg.Flags = int32(usermem.ByteOrder.Uint32(b[48:]))
-
- return nil
-}
-
// CaptureAddress allocates memory for and copies a socket address structure
// from the untrusted address space range.
func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) {
@@ -145,10 +133,10 @@ func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte,
// writeAddress writes a sockaddr structure and its length to an output buffer
// in the untrusted address space range. If the address is bigger than the
// buffer, it is truncated.
-func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error {
+func writeAddress(t *kernel.Task, addr linux.SockAddr, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error {
// Get the buffer length.
var bufLen uint32
- if _, err := t.CopyIn(addrLenPtr, &bufLen); err != nil {
+ if _, err := primitive.CopyUint32In(t, addrLenPtr, &bufLen); err != nil {
return err
}
@@ -157,7 +145,7 @@ func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr user
}
// Write the length unconditionally.
- if _, err := t.CopyOut(addrLenPtr, addrLen); err != nil {
+ if _, err := primitive.CopyUint32Out(t, addrLenPtr, addrLen); err != nil {
return err
}
@@ -170,7 +158,8 @@ func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr user
}
// Copy as much of the address as will fit in the buffer.
- encodedAddr := binary.Marshal(nil, usermem.ByteOrder, addr)
+ encodedAddr := t.CopyScratchBuffer(addr.SizeBytes())
+ addr.MarshalUnsafe(encodedAddr)
if bufLen > uint32(len(encodedAddr)) {
bufLen = uint32(len(encodedAddr))
}
@@ -194,7 +183,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if e != nil {
return 0, nil, e.ToError()
}
- defer s.DecRef()
+ defer s.DecRef(t)
if err := s.SetStatusFlags(t, t.Credentials(), uint32(stype&linux.SOCK_NONBLOCK)); err != nil {
return 0, nil, err
@@ -228,8 +217,8 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
return 0, nil, e.ToError()
}
// Adding to the FD table will cause an extra reference to be acquired.
- defer s1.DecRef()
- defer s2.DecRef()
+ defer s1.DecRef(t)
+ defer s2.DecRef(t)
nonblocking := uint32(stype & linux.SOCK_NONBLOCK)
if err := s1.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil {
@@ -248,10 +237,10 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
return 0, nil, err
}
- if _, err := t.CopyOut(addr, fds); err != nil {
+ if _, err := primitive.CopyInt32SliceOut(t, addr, fds); err != nil {
for _, fd := range fds {
- if _, file := t.FDTable().Remove(fd); file != nil {
- file.DecRef()
+ if _, file := t.FDTable().Remove(t, fd); file != nil {
+ file.DecRef(t)
}
}
return 0, nil, err
@@ -271,7 +260,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -286,7 +275,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
}
blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0
- return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), kernel.ERESTARTSYS)
+ return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), syserror.ERESTARTSYS)
}
// accept is the implementation of the accept syscall. It is called by accept
@@ -302,7 +291,7 @@ func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, f
if file == nil {
return 0, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -317,7 +306,7 @@ func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, f
peerRequested := addrLen != 0
nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking)
if e != nil {
- return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+ return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS)
}
if peerRequested {
// NOTE(magi): Linux does not give you an error if it can't
@@ -361,7 +350,7 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -388,7 +377,7 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -417,7 +406,7 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -448,7 +437,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -457,8 +446,8 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
}
// Read the length. Reject negative values.
- optLen := int32(0)
- if _, err := t.CopyIn(optLenAddr, &optLen); err != nil {
+ var optLen int32
+ if _, err := primitive.CopyInt32In(t, optLenAddr, &optLen); err != nil {
return 0, nil, err
}
if optLen < 0 {
@@ -472,12 +461,12 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
}
vLen := int32(binary.Size(v))
- if _, err := t.CopyOut(optLenAddr, vLen); err != nil {
+ if _, err := primitive.CopyInt32Out(t, optLenAddr, vLen); err != nil {
return 0, nil, err
}
if v != nil {
- if _, err := t.CopyOut(optValAddr, v); err != nil {
+ if _, err := v.CopyOut(t, optValAddr); err != nil {
return 0, nil, err
}
}
@@ -487,7 +476,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
// getSockOpt tries to handle common socket options, or dispatches to a specific
// socket implementation.
-func getSockOpt(t *kernel.Task, s socket.SocketVFS2, level, name int, optValAddr usermem.Addr, len int) (interface{}, *syserr.Error) {
+func getSockOpt(t *kernel.Task, s socket.SocketVFS2, level, name int, optValAddr usermem.Addr, len int) (marshal.Marshallable, *syserr.Error) {
if level == linux.SOL_SOCKET {
switch name {
case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL:
@@ -499,13 +488,16 @@ func getSockOpt(t *kernel.Task, s socket.SocketVFS2, level, name int, optValAddr
switch name {
case linux.SO_TYPE:
_, skType, _ := s.Type()
- return int32(skType), nil
+ v := primitive.Int32(skType)
+ return &v, nil
case linux.SO_DOMAIN:
family, _, _ := s.Type()
- return int32(family), nil
+ v := primitive.Int32(family)
+ return &v, nil
case linux.SO_PROTOCOL:
_, _, protocol := s.Type()
- return int32(protocol), nil
+ v := primitive.Int32(protocol)
+ return &v, nil
}
}
@@ -527,7 +519,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -542,7 +534,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
return 0, nil, syserror.EINVAL
}
buf := t.CopyScratchBuffer(int(optLen))
- if _, err := t.CopyIn(optValAddr, &buf); err != nil {
+ if _, err := t.CopyInBytes(optValAddr, buf); err != nil {
return 0, nil, err
}
@@ -565,7 +557,7 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -593,7 +585,7 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -626,7 +618,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -679,7 +671,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -731,7 +723,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if !ok {
return 0, nil, syserror.EFAULT
}
- if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+ if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil {
break
}
count++
@@ -746,7 +738,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) {
// Capture the message header and io vectors.
var msg MessageHeader64
- if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+ if _, err := msg.CopyIn(t, msgPtr); err != nil {
return 0, err
}
@@ -769,16 +761,16 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla
if msg.ControlLen == 0 && msg.NameLen == 0 {
n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0)
if err != nil {
- return 0, syserror.ConvertIntr(err.ToError(), kernel.ERESTARTSYS)
+ return 0, syserror.ConvertIntr(err.ToError(), syserror.ERESTARTSYS)
}
if !cms.Unix.Empty() {
mflags |= linux.MSG_CTRUNC
- cms.Release()
+ cms.Release(t)
}
if int(msg.Flags) != mflags {
// Copy out the flags to the caller.
- if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+ if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil {
return 0, err
}
}
@@ -791,9 +783,9 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla
}
n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen)
if e != nil {
- return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+ return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS)
}
- defer cms.Release()
+ defer cms.Release(t)
controlData := make([]byte, 0, msg.ControlLen)
controlData = control.PackControlMessages(t, cms, controlData)
@@ -815,17 +807,17 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla
}
// Copy the control data to the caller.
- if _, err := t.CopyOut(msgPtr+controlLenOffset, uint64(len(controlData))); err != nil {
+ if _, err := primitive.CopyUint64Out(t, msgPtr+controlLenOffset, uint64(len(controlData))); err != nil {
return 0, err
}
if len(controlData) > 0 {
- if _, err := t.CopyOut(usermem.Addr(msg.Control), controlData); err != nil {
+ if _, err := t.CopyOutBytes(usermem.Addr(msg.Control), controlData); err != nil {
return 0, err
}
}
// Copy out the flags to the caller.
- if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+ if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil {
return 0, err
}
@@ -849,7 +841,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag
if file == nil {
return 0, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -878,9 +870,9 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag
}
n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0)
- cm.Release()
+ cm.Release(t)
if e != nil {
- return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+ return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS)
}
// Copy the address to the caller.
@@ -922,7 +914,7 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -960,7 +952,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -994,7 +986,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if !ok {
return 0, nil, syserror.EFAULT
}
- if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+ if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil {
break
}
count++
@@ -1009,7 +1001,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescription, msgPtr usermem.Addr, flags int32) (uintptr, error) {
// Capture the message header.
var msg MessageHeader64
- if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+ if _, err := msg.CopyIn(t, msgPtr); err != nil {
return 0, err
}
@@ -1020,7 +1012,7 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio
return 0, syserror.ENOBUFS
}
controlData = make([]byte, msg.ControlLen)
- if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil {
+ if _, err := t.CopyInBytes(usermem.Addr(msg.Control), controlData); err != nil {
return 0, err
}
}
@@ -1062,9 +1054,11 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio
// Call the syscall implementation.
n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages)
- err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file)
- if err != nil {
- controlMessages.Release()
+ err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendmsg", file)
+ // Control messages should be released on error as well as for zero-length
+ // messages, which are discarded by the receiver.
+ if n == 0 || err != nil {
+ controlMessages.Release(t)
}
return uintptr(n), err
}
@@ -1082,7 +1076,7 @@ func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags
if file == nil {
return 0, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -1122,7 +1116,7 @@ func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags
// Call the syscall implementation.
n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: control.New(t, s, nil)})
- return uintptr(n), slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendto", file)
+ return uintptr(n), slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendto", file)
}
// SendTo implements the linux syscall sendto(2).
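Both CopyIn call sites above rely on code generated by the +marshal directive on MessageHeader64. Conceptually, the generated logic replaces the deleted CopyInMessageHeader64 with something like the following simplified sketch (the actual generated code also handles partial copies and avoids extra work where it can):

	// Sketch of the go_marshal-generated copy-in for MessageHeader64.
	func copyInMessageHeader64Sketch(t *kernel.Task, addr usermem.Addr, msg *MessageHeader64) error {
		buf := t.CopyScratchBuffer(msg.SizeBytes())
		if _, err := t.CopyInBytes(addr, buf); err != nil {
			return err
		}
		msg.UnmarshalUnsafe(buf) // decode every field in host byte order
		return nil
	}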
diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go
index 945a364a7..035e2a6b0 100644
--- a/pkg/sentry/syscalls/linux/vfs2/splice.go
+++ b/pkg/sentry/syscalls/linux/vfs2/splice.go
@@ -15,12 +15,18 @@
package vfs2
import (
+ "io"
+
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+ slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -39,6 +45,9 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if count > int64(kernel.MAX_RW_COUNT) {
count = int64(kernel.MAX_RW_COUNT)
}
+ if count < 0 {
+ return 0, nil, syserror.EINVAL
+ }
// Check for invalid flags.
if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
@@ -50,12 +59,12 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if inFile == nil {
return 0, nil, syserror.EBADF
}
- defer inFile.DecRef()
+ defer inFile.DecRef(t)
outFile := t.GetFileVFS2(outFD)
if outFile == nil {
return 0, nil, syserror.EBADF
}
- defer outFile.DecRef()
+ defer outFile.DecRef(t)
// Check that both files support the required directionality.
if !inFile.IsReadable() || !outFile.IsWritable() {
@@ -85,7 +94,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if inFile.Options().DenyPRead {
return 0, nil, syserror.EINVAL
}
- if _, err := t.CopyIn(inOffsetPtr, &inOffset); err != nil {
+ if _, err := primitive.CopyInt64In(t, inOffsetPtr, &inOffset); err != nil {
return 0, nil, err
}
if inOffset < 0 {
@@ -100,7 +109,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if outFile.Options().DenyPWrite {
return 0, nil, syserror.EINVAL
}
- if _, err := t.CopyIn(outOffsetPtr, &outOffset); err != nil {
+ if _, err := primitive.CopyInt64In(t, outOffsetPtr, &outOffset); err != nil {
return 0, nil, err
}
if outOffset < 0 {
@@ -110,89 +119,67 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Move data.
var (
- n int64
- err error
- inCh chan struct{}
- outCh chan struct{}
+ n int64
+ err error
)
+ dw := dualWaiter{
+ inFile: inFile,
+ outFile: outFile,
+ }
+ defer dw.destroy()
for {
// If both input and output are pipes, delegate to the pipe
- // implementation. Otherwise, exactly one end is a pipe, which we
- // ensure is consistently ordered after the non-pipe FD's locks by
- // passing the pipe FD as usermem.IO to the non-pipe end.
+ // implementation. Otherwise, exactly one end is a pipe, which
+ // we ensure is consistently ordered after the non-pipe FD's
+ // locks by passing the pipe FD as usermem.IO to the non-pipe
+ // end.
switch {
case inIsPipe && outIsPipe:
n, err = pipe.Splice(t, outPipeFD, inPipeFD, count)
case inIsPipe:
+ n, err = inPipeFD.SpliceToNonPipe(t, outFile, outOffset, count)
if outOffset != -1 {
- n, err = outFile.PWrite(t, inPipeFD.IOSequence(count), outOffset, vfs.WriteOptions{})
outOffset += n
- } else {
- n, err = outFile.Write(t, inPipeFD.IOSequence(count), vfs.WriteOptions{})
}
case outIsPipe:
+ n, err = outPipeFD.SpliceFromNonPipe(t, inFile, inOffset, count)
if inOffset != -1 {
- n, err = inFile.PRead(t, outPipeFD.IOSequence(count), inOffset, vfs.ReadOptions{})
inOffset += n
- } else {
- n, err = inFile.Read(t, outPipeFD.IOSequence(count), vfs.ReadOptions{})
}
+ default:
+ panic("at least one end of splice must be a pipe")
}
+
if n != 0 || err != syserror.ErrWouldBlock || nonBlock {
break
}
-
- // Note that the blocking behavior here is a bit different than the
- // normal pattern. Because we need to have both data to read and data
- // to write simultaneously, we actually explicitly block on both of
- // these cases in turn before returning to the splice operation.
- if inFile.Readiness(eventMaskRead)&eventMaskRead == 0 {
- if inCh == nil {
- inCh = make(chan struct{}, 1)
- inW, _ := waiter.NewChannelEntry(inCh)
- inFile.EventRegister(&inW, eventMaskRead)
- defer inFile.EventUnregister(&inW)
- continue // Need to refresh readiness.
- }
- if err = t.Block(inCh); err != nil {
- break
- }
- }
- if outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 {
- if outCh == nil {
- outCh = make(chan struct{}, 1)
- outW, _ := waiter.NewChannelEntry(outCh)
- outFile.EventRegister(&outW, eventMaskWrite)
- defer outFile.EventUnregister(&outW)
- continue // Need to refresh readiness.
- }
- if err = t.Block(outCh); err != nil {
- break
- }
+ if err = dw.waitForBoth(t); err != nil {
+ break
}
}
// Copy updated offsets out.
if inOffsetPtr != 0 {
- if _, err := t.CopyOut(inOffsetPtr, &inOffset); err != nil {
+ if _, err := primitive.CopyInt64Out(t, inOffsetPtr, inOffset); err != nil {
return 0, nil, err
}
}
if outOffsetPtr != 0 {
- if _, err := t.CopyOut(outOffsetPtr, &outOffset); err != nil {
+ if _, err := primitive.CopyInt64Out(t, outOffsetPtr, outOffset); err != nil {
return 0, nil, err
}
}
- if n == 0 {
- return 0, nil, err
+ if n != 0 {
+ // On Linux, inotify behavior is not very consistent with splice(2). We try
+ // our best to emulate Linux for very basic calls to splice, where for some
+ // reason, events are generated for output files, but not input files.
+ outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
}
- // On Linux, inotify behavior is not very consistent with splice(2). We try
- // our best to emulate Linux for very basic calls to splice, where for some
- // reason, events are generated for output files, but not input files.
- outFile.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
- return uintptr(n), nil, nil
+ // We can only pass a single file to handleIOError, so pick outFile arbitrarily.
+ // This is used only for debugging purposes.
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "splice", outFile)
}
// Tee implements Linux syscall tee(2).
@@ -208,6 +195,9 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
if count > int64(kernel.MAX_RW_COUNT) {
count = int64(kernel.MAX_RW_COUNT)
}
+ if count < 0 {
+ return 0, nil, syserror.EINVAL
+ }
// Check for invalid flags.
if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
@@ -219,12 +209,12 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
if inFile == nil {
return 0, nil, syserror.EBADF
}
- defer inFile.DecRef()
+ defer inFile.DecRef(t)
outFile := t.GetFileVFS2(outFD)
if outFile == nil {
return 0, nil, syserror.EBADF
}
- defer outFile.DecRef()
+ defer outFile.DecRef(t)
// Check that both files support the required directionality.
if !inFile.IsReadable() || !outFile.IsWritable() {
@@ -247,45 +237,274 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
// Copy data.
var (
- inCh chan struct{}
- outCh chan struct{}
+ n int64
+ err error
)
+ dw := dualWaiter{
+ inFile: inFile,
+ outFile: outFile,
+ }
+ defer dw.destroy()
for {
- n, err := pipe.Tee(t, outPipeFD, inPipeFD, count)
- if n != 0 {
- return uintptr(n), nil, nil
+ n, err = pipe.Tee(t, outPipeFD, inPipeFD, count)
+ if n != 0 || err != syserror.ErrWouldBlock || nonBlock {
+ break
+ }
+ if err = dw.waitForBoth(t); err != nil {
+ break
+ }
+ }
+
+ if n != 0 {
+ outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
+
+ // If a partial write is completed, the error is dropped. Log it here.
+ if err != nil && err != io.EOF && err != syserror.ErrWouldBlock {
+ log.Debugf("tee completed a partial write with error: %v", err)
+ err = nil
}
- if err != syserror.ErrWouldBlock || nonBlock {
+ }
+
+ // We can only pass a single file to handleIOError, so pick inFile arbitrarily.
+ // This is used only for debugging purposes.
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "tee", inFile)
+}
+
+// Sendfile implements linux system call sendfile(2).
+func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ outFD := args[0].Int()
+ inFD := args[1].Int()
+ offsetAddr := args[2].Pointer()
+ count := int64(args[3].SizeT())
+
+ inFile := t.GetFileVFS2(inFD)
+ if inFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer inFile.DecRef(t)
+ if !inFile.IsReadable() {
+ return 0, nil, syserror.EBADF
+ }
+
+ outFile := t.GetFileVFS2(outFD)
+ if outFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer outFile.DecRef(t)
+ if !outFile.IsWritable() {
+ return 0, nil, syserror.EBADF
+ }
+
+ // Verify that the outFile Append flag is not set.
+ if outFile.StatusFlags()&linux.O_APPEND != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Verify that inFile is a regular file or block device. This is a
+ // requirement; the same check appears in Linux
+ // (fs/splice.c:splice_direct_to_actor).
+ if stat, err := inFile.Stat(t, vfs.StatOptions{Mask: linux.STATX_TYPE}); err != nil {
+ return 0, nil, err
+ } else if stat.Mask&linux.STATX_TYPE == 0 ||
+ (stat.Mode&linux.S_IFMT != linux.S_IFREG && stat.Mode&linux.S_IFMT != linux.S_IFBLK) {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Copy offset if it exists.
+ offset := int64(-1)
+ if offsetAddr != 0 {
+ if inFile.Options().DenyPRead {
+ return 0, nil, syserror.ESPIPE
+ }
+ var offsetP primitive.Int64
+ if _, err := offsetP.CopyIn(t, offsetAddr); err != nil {
return 0, nil, err
}
+ offset = int64(offsetP)
+
+ if offset < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ if offset+count < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ }
+
+ // Validate count. This must come after offset checks.
+ if count < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ if count == 0 {
+ return 0, nil, nil
+ }
+ if count > int64(kernel.MAX_RW_COUNT) {
+ count = int64(kernel.MAX_RW_COUNT)
+ }
- // Note that the blocking behavior here is a bit different than the
- // normal pattern. Because we need to have both data to read and data
- // to write simultaneously, we actually explicitly block on both of
- // these cases in turn before returning to the tee operation.
- if inFile.Readiness(eventMaskRead)&eventMaskRead == 0 {
- if inCh == nil {
- inCh = make(chan struct{}, 1)
- inW, _ := waiter.NewChannelEntry(inCh)
- inFile.EventRegister(&inW, eventMaskRead)
- defer inFile.EventUnregister(&inW)
- continue // Need to refresh readiness.
+ // Copy data.
+ var (
+ n int64
+ err error
+ )
+ dw := dualWaiter{
+ inFile: inFile,
+ outFile: outFile,
+ }
+ defer dw.destroy()
+ outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
+ // Reading from the input file should never block, since it is a regular
+ // file or block device. We only need to check whether writing to the output file
+ // can block.
+ nonBlock := outFile.StatusFlags()&linux.O_NONBLOCK != 0
+ if outIsPipe {
+ for n < count {
+ var spliceN int64
+ spliceN, err = outPipeFD.SpliceFromNonPipe(t, inFile, offset, count)
+ if offset != -1 {
+ offset += spliceN
}
- if err := t.Block(inCh); err != nil {
- return 0, nil, err
+ n += spliceN
+ if err == syserror.ErrWouldBlock && !nonBlock {
+ err = dw.waitForBoth(t)
+ }
+ if err != nil {
+ break
}
}
- if outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 {
- if outCh == nil {
- outCh = make(chan struct{}, 1)
- outW, _ := waiter.NewChannelEntry(outCh)
- outFile.EventRegister(&outW, eventMaskWrite)
- defer outFile.EventUnregister(&outW)
- continue // Need to refresh readiness.
+ } else {
+ // Read inFile to buffer, then write the contents to outFile.
+ buf := make([]byte, count)
+ for n < count {
+ var readN int64
+ if offset != -1 {
+ readN, err = inFile.PRead(t, usermem.BytesIOSequence(buf), offset, vfs.ReadOptions{})
+ offset += readN
+ } else {
+ readN, err = inFile.Read(t, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
}
- if err := t.Block(outCh); err != nil {
- return 0, nil, err
+ n += readN
+
+ // Write all of the bytes that we read. This may need
+ // multiple write calls to complete.
+ wbuf := buf[:readN]
+ for len(wbuf) > 0 {
+ var writeN int64
+ writeN, err = outFile.Write(t, usermem.BytesIOSequence(wbuf), vfs.WriteOptions{})
+ wbuf = wbuf[writeN:]
+ if err == syserror.ErrWouldBlock && !nonBlock {
+ err = dw.waitForOut(t)
+ }
+ if err != nil {
+ // We didn't complete the write. Only report the bytes that were actually
+ // written, and rewind offsets as needed.
+ notWritten := int64(len(wbuf))
+ n -= notWritten
+ if offset == -1 {
+ // We modified the offset of the input file itself during the read
+ // operation. Rewind it.
+ if _, seekErr := inFile.Seek(t, -notWritten, linux.SEEK_CUR); seekErr != nil {
+ // Log the error but don't return it; the bytes counted in n were
+ // written successfully, and only the offset rollback failed.
+ log.Warningf("failed to roll back input file offset: %v", seekErr)
+ }
+ } else {
+ // The sendfile call was provided an offset parameter that should be
+ // adjusted to reflect the number of bytes sent. Rewind it.
+ offset -= notWritten
+ }
+ break
+ }
+ }
+ if err == syserror.ErrWouldBlock && !nonBlock {
+ err = dw.waitForBoth(t)
+ }
+ if err != nil {
+ break
}
}
}
+
+ if offsetAddr != 0 {
+ // Copy out the new offset.
+ offsetP := primitive.Uint64(offset)
+ if _, err := offsetP.CopyOut(t, offsetAddr); err != nil {
+ return 0, nil, err
+ }
+ }
+
+ if n != 0 {
+ inFile.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent)
+ outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
+
+ if err != nil && err != io.EOF && err != syserror.ErrWouldBlock {
+ // If a partial write is completed, the error is dropped. Log it here.
+ log.Debugf("sendfile completed a partial write with error: %v", err)
+ err = nil
+ }
+ }
+
+ // We can only pass a single file to handleIOError, so pick inFile arbitrarily.
+ // This is used only for debugging purposes.
+ return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "sendfile", inFile)
+}
+
+// dualWaiter is used to wait on one or both vfs.FileDescriptions. It is not
+// thread-safe, and does not take a reference on the vfs.FileDescriptions.
+//
+// Users must call destroy() when finished.
+type dualWaiter struct {
+ inFile *vfs.FileDescription
+ outFile *vfs.FileDescription
+
+ inW waiter.Entry
+ inCh chan struct{}
+ outW waiter.Entry
+ outCh chan struct{}
+}
+
+// waitForBoth waits for both dw.inFile and dw.outFile to be ready.
+func (dw *dualWaiter) waitForBoth(t *kernel.Task) error {
+ if dw.inFile.Readiness(eventMaskRead)&eventMaskRead == 0 {
+ if dw.inCh == nil {
+ dw.inW, dw.inCh = waiter.NewChannelEntry(nil)
+ dw.inFile.EventRegister(&dw.inW, eventMaskRead)
+ // We might be ready now. Try again before blocking.
+ return nil
+ }
+ if err := t.Block(dw.inCh); err != nil {
+ return err
+ }
+ }
+ return dw.waitForOut(t)
+}
+
+// waitForOut waits for dw.outFile to be ready.
+func (dw *dualWaiter) waitForOut(t *kernel.Task) error {
+ if dw.outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 {
+ if dw.outCh == nil {
+ dw.outW, dw.outCh = waiter.NewChannelEntry(nil)
+ dw.outFile.EventRegister(&dw.outW, eventMaskWrite)
+ // We might be ready now. Try again before blocking.
+ return nil
+ }
+ if err := t.Block(dw.outCh); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+// destroy cleans up resources held by dw. No more calls to wait* can occur
+// after destroy is called.
+func (dw *dualWaiter) destroy() {
+ if dw.inCh != nil {
+ dw.inFile.EventUnregister(&dw.inW)
+ dw.inCh = nil
+ }
+ if dw.outCh != nil {
+ dw.outFile.EventUnregister(&dw.outW)
+ dw.outCh = nil
+ }
+ dw.inFile = nil
+ dw.outFile = nil
}
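dualWaiter consolidates the channel-registration logic that Splice and Tee previously inlined. The intended calling pattern, as a sketch (transfer stands in for pipe.Splice, pipe.Tee, or the non-pipe fallbacks above):

	dw := dualWaiter{inFile: inFile, outFile: outFile}
	defer dw.destroy()
	for {
		n, err = transfer(t, outFile, inFile, count)
		if n != 0 || err != syserror.ErrWouldBlock || nonBlock {
			break
		}
		// The first waitForBoth call only registers waiters and returns nil
		// so that readiness is re-checked; later calls block until ready.
		if err = dw.waitForBoth(t); err != nil {
			break
		}
	}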
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go
index bb1d5cac4..0f5d5189c 100644
--- a/pkg/sentry/syscalls/linux/vfs2/stat.go
+++ b/pkg/sentry/syscalls/linux/vfs2/stat.go
@@ -65,7 +65,7 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags
}
root := t.FSContext().RootDirectoryVFS2()
- defer root.DecRef()
+ defer root.DecRef(t)
start := root
if !path.Absolute {
if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
@@ -73,7 +73,7 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags
}
if dirfd == linux.AT_FDCWD {
start = t.FSContext().WorkingDirectoryVFS2()
- defer start.DecRef()
+ defer start.DecRef(t)
} else {
dirfile := t.GetFileVFS2(dirfd)
if dirfile == nil {
@@ -85,7 +85,7 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags
// former may be able to use opened file state to expedite the
// Stat.
statx, err := dirfile.Stat(t, opts)
- dirfile.DecRef()
+ dirfile.DecRef(t)
if err != nil {
return err
}
@@ -96,8 +96,8 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags
}
start = dirfile.VirtualDentry()
start.IncRef()
- defer start.DecRef()
- dirfile.DecRef()
+ defer start.DecRef(t)
+ dirfile.DecRef(t)
}
}
@@ -132,7 +132,7 @@ func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
statx, err := file.Stat(t, vfs.StatOptions{
Mask: linux.STATX_BASIC_STATS,
@@ -177,7 +177,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
root := t.FSContext().RootDirectoryVFS2()
- defer root.DecRef()
+ defer root.DecRef(t)
start := root
if !path.Absolute {
if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
@@ -185,7 +185,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
if dirfd == linux.AT_FDCWD {
start = t.FSContext().WorkingDirectoryVFS2()
- defer start.DecRef()
+ defer start.DecRef(t)
} else {
dirfile := t.GetFileVFS2(dirfd)
if dirfile == nil {
@@ -197,7 +197,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// former may be able to use opened file state to expedite the
// Stat.
statx, err := dirfile.Stat(t, opts)
- dirfile.DecRef()
+ dirfile.DecRef(t)
if err != nil {
return 0, nil, err
}
@@ -207,8 +207,8 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
start = dirfile.VirtualDentry()
start.IncRef()
- defer start.DecRef()
- dirfile.DecRef()
+ defer start.DecRef(t)
+ dirfile.DecRef(t)
}
}
@@ -282,7 +282,7 @@ func accessAt(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) err
if err != nil {
return err
}
- defer tpop.Release()
+ defer tpop.Release(t)
// access(2) and faccessat(2) check permissions using real
// UID/GID, not effective UID/GID.
@@ -328,7 +328,7 @@ func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr usermem.Addr, siz
if err != nil {
return 0, nil, err
}
- defer tpop.Release()
+ defer tpop.Release(t)
target, err := t.Kernel().VFS().ReadlinkAt(t, t.Credentials(), &tpop.pop)
if err != nil {
@@ -358,7 +358,7 @@ func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if err != nil {
return 0, nil, err
}
- defer tpop.Release()
+ defer tpop.Release(t)
statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop)
if err != nil {
@@ -377,7 +377,7 @@ func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if err != nil {
return 0, nil, err
}
- defer tpop.Release()
+ defer tpop.Release(t)
statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop)
if err != nil {
diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go
index 365250b0b..6e9b599e2 100644
--- a/pkg/sentry/syscalls/linux/vfs2/sync.go
+++ b/pkg/sentry/syscalls/linux/vfs2/sync.go
@@ -34,7 +34,7 @@ func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
return 0, nil, file.SyncFS(t)
}
@@ -47,7 +47,7 @@ func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
return 0, nil, file.Sync(t)
}
@@ -65,10 +65,8 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
nbytes := args[2].Int64()
flags := args[3].Uint()
- if offset < 0 {
- return 0, nil, syserror.EINVAL
- }
- if nbytes < 0 {
+ // Check for negative values and overflow.
+ if offset < 0 || offset+nbytes < 0 {
return 0, nil, syserror.EINVAL
}
if flags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE|linux.SYNC_FILE_RANGE_WRITE|linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 {
@@ -79,9 +77,39 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
- // TODO(gvisor.dev/issue/1897): Avoid writeback of data ranges outside of
- // [offset, offset+nbytes).
- return 0, nil, file.Sync(t)
+ // TODO(gvisor.dev/issue/1897): Currently, the only file syncing we support
+ // is a full-file sync, i.e. fsync(2). As a result, there are severe
+ // limitations on how much we support sync_file_range:
+ // - In Linux, sync_file_range(2) doesn't write out the file's metadata, even
+ // if the file size is changed. We do.
+ // - We always sync the entire file instead of [offset, offset+nbytes).
+ // - We do not support the use of WAIT_BEFORE without WAIT_AFTER. For
+ // correctness, we would have to perform a write-out every time WAIT_BEFORE
+ // was used, but this would be much more expensive than expected if there
+ // were no write-out operations in progress.
+ // - Whenever WAIT_AFTER is used, we sync the file.
+ // - Ignore WRITE. If this flag is used with WAIT_AFTER, then the file will
+ // be synced anyway. If this flag is used without WAIT_AFTER, then it is
+ // safe (and less expensive) to do nothing, because the syscall will not
+ // wait for the write-out to complete--we only need to make sure that the
+ // next time WAIT_BEFORE or WAIT_AFTER are used, the write-out completes.
+ // - According to fs/sync.c, WAIT_BEFORE|WAIT_AFTER "will detect any I/O
+ // errors or ENOSPC conditions and will return those to the caller, after
+ // clearing the EIO and ENOSPC flags in the address_space." We don't do
+ // this.
+
+ if flags&linux.SYNC_FILE_RANGE_WAIT_BEFORE != 0 &&
+ flags&linux.SYNC_FILE_RANGE_WAIT_AFTER == 0 {
+ t.Kernel().EmitUnimplementedEvent(t)
+ return 0, nil, syserror.ENOSYS
+ }
+
+ if flags&linux.SYNC_FILE_RANGE_WAIT_AFTER != 0 {
+ if err := file.Sync(t); err != nil {
+ return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS)
+ }
+ }
+ return 0, nil, nil
}
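Given the limitations enumerated in the comment above, the effective behavior by flag combination reduces to the following summary (a sketch that restates the function above; constants are from the linux package):

	// syncFileRangeBehavior summarizes the implementation above.
	func syncFileRangeBehavior(flags uint) string {
		const (
			before = linux.SYNC_FILE_RANGE_WAIT_BEFORE
			write  = linux.SYNC_FILE_RANGE_WRITE
			after  = linux.SYNC_FILE_RANGE_WAIT_AFTER
		)
		switch {
		case flags&before != 0 && flags&after == 0:
			return "ENOSYS: WAIT_BEFORE without WAIT_AFTER is unimplemented"
		case flags&after != 0:
			return "full file.Sync, i.e. fsync-equivalent"
		case flags&write != 0:
			return "no-op: write-out is deferred until the next WAIT_*"
		default:
			return "no-op"
		}
	}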
diff --git a/pkg/sentry/syscalls/linux/vfs2/timerfd.go b/pkg/sentry/syscalls/linux/vfs2/timerfd.go
index 5ac79bc09..250870c03 100644
--- a/pkg/sentry/syscalls/linux/vfs2/timerfd.go
+++ b/pkg/sentry/syscalls/linux/vfs2/timerfd.go
@@ -50,11 +50,11 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
return 0, nil, syserror.EINVAL
}
vfsObj := t.Kernel().VFS()
- file, err := timerfd.New(vfsObj, clock, fileFlags)
+ file, err := timerfd.New(t, vfsObj, clock, fileFlags)
if err != nil {
return 0, nil, err
}
- defer file.DecRef()
+ defer file.DecRef(t)
fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
CloseOnExec: flags&linux.TFD_CLOEXEC != 0,
})
@@ -79,7 +79,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
tfd, ok := file.Impl().(*timerfd.TimerFileDescription)
if !ok {
@@ -87,7 +87,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
}
var newVal linux.Itimerspec
- if _, err := t.CopyIn(newValAddr, &newVal); err != nil {
+ if _, err := newVal.CopyIn(t, newValAddr); err != nil {
return 0, nil, err
}
newS, err := ktime.SettingFromItimerspec(newVal, flags&linux.TFD_TIMER_ABSTIME != 0, tfd.Clock())
@@ -97,7 +97,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
tm, oldS := tfd.SetTime(newS)
if oldValAddr != 0 {
oldVal := ktime.ItimerspecFromSetting(tm, oldS)
- if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil {
+ if _, err := oldVal.CopyOut(t, oldValAddr); err != nil {
return 0, nil, err
}
}
@@ -113,7 +113,7 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
tfd, ok := file.Impl().(*timerfd.TimerFileDescription)
if !ok {
@@ -122,6 +122,6 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
tm, s := tfd.GetTime()
curVal := ktime.ItimerspecFromSetting(tm, s)
- _, err := t.CopyOut(curValAddr, &curVal)
+ _, err := curVal.CopyOut(t, curValAddr)
return 0, nil, err
}
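
The CopyIn/CopyOut rewrites in this file follow the broader migration from Task-driven reflective copies to types that marshal themselves. A rough stand-in for the shape of the generated methods (illustrative only; the real methods come from the go_marshal tooling, and the exact signatures are an assumption here):

package sketch

// copyContext stands in for the *kernel.Task passed as the first argument in
// the calls above.
type copyContext interface{}

// selfMarshalling is roughly what a type like linux.Itimerspec exposes after
// the migration: the value, not the task, drives the copy.
type selfMarshalling interface {
	CopyIn(cc copyContext, addr uintptr) (int, error)  // user memory -> value
	CopyOut(cc copyContext, addr uintptr) (int, error) // value -> user memory
}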
diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
index ef8358b8a..c50fd97eb 100644
--- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go
+++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
@@ -44,7 +44,7 @@ func Override() {
s.Table[23] = syscalls.Supported("select", Select)
s.Table[32] = syscalls.Supported("dup", Dup)
s.Table[33] = syscalls.Supported("dup2", Dup2)
- delete(s.Table, 40) // sendfile
+ s.Table[40] = syscalls.Supported("sendfile", Sendfile)
s.Table[41] = syscalls.Supported("socket", Socket)
s.Table[42] = syscalls.Supported("connect", Connect)
s.Table[43] = syscalls.Supported("accept", Accept)
@@ -62,7 +62,7 @@ func Override() {
s.Table[55] = syscalls.Supported("getsockopt", GetSockOpt)
s.Table[59] = syscalls.Supported("execve", Execve)
s.Table[72] = syscalls.Supported("fcntl", Fcntl)
- delete(s.Table, 73) // flock
+ s.Table[73] = syscalls.Supported("flock", Flock)
s.Table[74] = syscalls.Supported("fsync", Fsync)
s.Table[75] = syscalls.Supported("fdatasync", Fdatasync)
s.Table[76] = syscalls.Supported("truncate", Truncate)
@@ -90,29 +90,25 @@ func Override() {
s.Table[138] = syscalls.Supported("fstatfs", Fstatfs)
s.Table[161] = syscalls.Supported("chroot", Chroot)
s.Table[162] = syscalls.Supported("sync", Sync)
- delete(s.Table, 165) // mount
- delete(s.Table, 166) // umount2
- delete(s.Table, 187) // readahead
- s.Table[188] = syscalls.Supported("setxattr", Setxattr)
+ s.Table[165] = syscalls.Supported("mount", Mount)
+ s.Table[166] = syscalls.Supported("umount2", Umount2)
+ s.Table[187] = syscalls.Supported("readahead", Readahead)
+ s.Table[188] = syscalls.Supported("setxattr", SetXattr)
s.Table[189] = syscalls.Supported("lsetxattr", Lsetxattr)
s.Table[190] = syscalls.Supported("fsetxattr", Fsetxattr)
- s.Table[191] = syscalls.Supported("getxattr", Getxattr)
+ s.Table[191] = syscalls.Supported("getxattr", GetXattr)
s.Table[192] = syscalls.Supported("lgetxattr", Lgetxattr)
s.Table[193] = syscalls.Supported("fgetxattr", Fgetxattr)
- s.Table[194] = syscalls.Supported("listxattr", Listxattr)
+ s.Table[194] = syscalls.Supported("listxattr", ListXattr)
s.Table[195] = syscalls.Supported("llistxattr", Llistxattr)
s.Table[196] = syscalls.Supported("flistxattr", Flistxattr)
- s.Table[197] = syscalls.Supported("removexattr", Removexattr)
+ s.Table[197] = syscalls.Supported("removexattr", RemoveXattr)
s.Table[198] = syscalls.Supported("lremovexattr", Lremovexattr)
s.Table[199] = syscalls.Supported("fremovexattr", Fremovexattr)
- delete(s.Table, 206) // io_setup
- delete(s.Table, 207) // io_destroy
- delete(s.Table, 208) // io_getevents
- delete(s.Table, 209) // io_submit
- delete(s.Table, 210) // io_cancel
+ s.Table[209] = syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"})
s.Table[213] = syscalls.Supported("epoll_create", EpollCreate)
s.Table[217] = syscalls.Supported("getdents64", Getdents64)
- delete(s.Table, 221) // fadvise64
+ s.Table[221] = syscalls.PartiallySupported("fadvise64", Fadvise64, "The syscall is 'supported', but ignores all provided advice.", nil)
s.Table[232] = syscalls.Supported("epoll_wait", EpollWait)
s.Table[233] = syscalls.Supported("epoll_ctl", EpollCtl)
s.Table[235] = syscalls.Supported("utimes", Utimes)
@@ -142,7 +138,7 @@ func Override() {
s.Table[282] = syscalls.Supported("signalfd", Signalfd)
s.Table[283] = syscalls.Supported("timerfd_create", TimerfdCreate)
s.Table[284] = syscalls.Supported("eventfd", Eventfd)
- delete(s.Table, 285) // fallocate
+ s.Table[285] = syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil)
s.Table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime)
s.Table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime)
s.Table[288] = syscalls.Supported("accept4", Accept4)
@@ -167,6 +163,112 @@ func Override() {
// Override ARM64.
s = linux.ARM64
+ s.Table[2] = syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"})
+ s.Table[5] = syscalls.Supported("setxattr", SetXattr)
+ s.Table[6] = syscalls.Supported("lsetxattr", Lsetxattr)
+ s.Table[7] = syscalls.Supported("fsetxattr", Fsetxattr)
+ s.Table[8] = syscalls.Supported("getxattr", GetXattr)
+ s.Table[9] = syscalls.Supported("lgetxattr", Lgetxattr)
+ s.Table[10] = syscalls.Supported("fgetxattr", Fgetxattr)
+ s.Table[11] = syscalls.Supported("listxattr", ListXattr)
+ s.Table[12] = syscalls.Supported("llistxattr", Llistxattr)
+ s.Table[13] = syscalls.Supported("flistxattr", Flistxattr)
+ s.Table[14] = syscalls.Supported("removexattr", RemoveXattr)
+ s.Table[15] = syscalls.Supported("lremovexattr", Lremovexattr)
+ s.Table[16] = syscalls.Supported("fremovexattr", Fremovexattr)
+ s.Table[17] = syscalls.Supported("getcwd", Getcwd)
+ s.Table[19] = syscalls.Supported("eventfd2", Eventfd2)
+ s.Table[20] = syscalls.Supported("epoll_create1", EpollCreate1)
+ s.Table[21] = syscalls.Supported("epoll_ctl", EpollCtl)
+ s.Table[22] = syscalls.Supported("epoll_pwait", EpollPwait)
+ s.Table[23] = syscalls.Supported("dup", Dup)
+ s.Table[24] = syscalls.Supported("dup3", Dup3)
+ s.Table[25] = syscalls.Supported("fcntl", Fcntl)
+ s.Table[26] = syscalls.PartiallySupported("inotify_init1", InotifyInit1, "inotify events are only available inside the sandbox.", nil)
+ s.Table[27] = syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil)
+ s.Table[28] = syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil)
+ s.Table[29] = syscalls.Supported("ioctl", Ioctl)
+ s.Table[32] = syscalls.Supported("flock", Flock)
+ s.Table[33] = syscalls.Supported("mknodat", Mknodat)
+ s.Table[34] = syscalls.Supported("mkdirat", Mkdirat)
+ s.Table[35] = syscalls.Supported("unlinkat", Unlinkat)
+ s.Table[36] = syscalls.Supported("symlinkat", Symlinkat)
+ s.Table[37] = syscalls.Supported("linkat", Linkat)
+ s.Table[38] = syscalls.Supported("renameat", Renameat)
+ s.Table[39] = syscalls.Supported("umount2", Umount2)
+ s.Table[40] = syscalls.Supported("mount", Mount)
+ s.Table[43] = syscalls.Supported("statfs", Statfs)
+ s.Table[44] = syscalls.Supported("fstatfs", Fstatfs)
+ s.Table[45] = syscalls.Supported("truncate", Truncate)
+ s.Table[46] = syscalls.Supported("ftruncate", Ftruncate)
+ s.Table[47] = syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil)
+ s.Table[48] = syscalls.Supported("faccessat", Faccessat)
+ s.Table[49] = syscalls.Supported("chdir", Chdir)
+ s.Table[50] = syscalls.Supported("fchdir", Fchdir)
+ s.Table[51] = syscalls.Supported("chroot", Chroot)
+ s.Table[52] = syscalls.Supported("fchmod", Fchmod)
+ s.Table[53] = syscalls.Supported("fchmodat", Fchmodat)
+ s.Table[54] = syscalls.Supported("fchownat", Fchownat)
+ s.Table[55] = syscalls.Supported("fchown", Fchown)
+ s.Table[56] = syscalls.Supported("openat", Openat)
+ s.Table[57] = syscalls.Supported("close", Close)
+ s.Table[59] = syscalls.Supported("pipe2", Pipe2)
+ s.Table[61] = syscalls.Supported("getdents64", Getdents64)
+ s.Table[62] = syscalls.Supported("lseek", Lseek)
s.Table[63] = syscalls.Supported("read", Read)
+ s.Table[64] = syscalls.Supported("write", Write)
+ s.Table[65] = syscalls.Supported("readv", Readv)
+ s.Table[66] = syscalls.Supported("writev", Writev)
+ s.Table[67] = syscalls.Supported("pread64", Pread64)
+ s.Table[68] = syscalls.Supported("pwrite64", Pwrite64)
+ s.Table[69] = syscalls.Supported("preadv", Preadv)
+ s.Table[70] = syscalls.Supported("pwritev", Pwritev)
+ s.Table[71] = syscalls.Supported("sendfile", Sendfile)
+ s.Table[72] = syscalls.Supported("pselect", Pselect)
+ s.Table[73] = syscalls.Supported("ppoll", Ppoll)
+ s.Table[74] = syscalls.Supported("signalfd4", Signalfd4)
+ s.Table[76] = syscalls.Supported("splice", Splice)
+ s.Table[77] = syscalls.Supported("tee", Tee)
+ s.Table[78] = syscalls.Supported("readlinkat", Readlinkat)
+ s.Table[79] = syscalls.Supported("newfstatat", Newfstatat)
+ s.Table[80] = syscalls.Supported("fstat", Fstat)
+ s.Table[81] = syscalls.Supported("sync", Sync)
+ s.Table[82] = syscalls.Supported("fsync", Fsync)
+ s.Table[83] = syscalls.Supported("fdatasync", Fdatasync)
+ s.Table[84] = syscalls.Supported("sync_file_range", SyncFileRange)
+ s.Table[85] = syscalls.Supported("timerfd_create", TimerfdCreate)
+ s.Table[86] = syscalls.Supported("timerfd_settime", TimerfdSettime)
+ s.Table[87] = syscalls.Supported("timerfd_gettime", TimerfdGettime)
+ s.Table[88] = syscalls.Supported("utimensat", Utimensat)
+ s.Table[198] = syscalls.Supported("socket", Socket)
+ s.Table[199] = syscalls.Supported("socketpair", SocketPair)
+ s.Table[200] = syscalls.Supported("bind", Bind)
+ s.Table[201] = syscalls.Supported("listen", Listen)
+ s.Table[202] = syscalls.Supported("accept", Accept)
+ s.Table[203] = syscalls.Supported("connect", Connect)
+ s.Table[204] = syscalls.Supported("getsockname", GetSockName)
+ s.Table[205] = syscalls.Supported("getpeername", GetPeerName)
+ s.Table[206] = syscalls.Supported("sendto", SendTo)
+ s.Table[207] = syscalls.Supported("recvfrom", RecvFrom)
+ s.Table[208] = syscalls.Supported("setsockopt", SetSockOpt)
+ s.Table[209] = syscalls.Supported("getsockopt", GetSockOpt)
+ s.Table[210] = syscalls.Supported("shutdown", Shutdown)
+ s.Table[211] = syscalls.Supported("sendmsg", SendMsg)
+ s.Table[212] = syscalls.Supported("recvmsg", RecvMsg)
+ s.Table[213] = syscalls.Supported("readahead", Readahead)
+ s.Table[221] = syscalls.Supported("execve", Execve)
+ s.Table[222] = syscalls.Supported("mmap", Mmap)
+ s.Table[223] = syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil)
+ s.Table[242] = syscalls.Supported("accept4", Accept4)
+ s.Table[243] = syscalls.Supported("recvmmsg", RecvMMsg)
+ s.Table[267] = syscalls.Supported("syncfs", Syncfs)
+ s.Table[269] = syscalls.Supported("sendmmsg", SendMMsg)
+ s.Table[276] = syscalls.Supported("renameat2", Renameat2)
+ s.Table[279] = syscalls.Supported("memfd_create", MemfdCreate)
+ s.Table[281] = syscalls.Supported("execveat", Execveat)
+ s.Table[286] = syscalls.Supported("preadv2", Preadv2)
+ s.Table[287] = syscalls.Supported("pwritev2", Pwritev2)
+ s.Table[291] = syscalls.Supported("statx", Statx)
+
s.Init()
}
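
Each entry above replaces a delete(s.Table, n) placeholder with a handler. A simplified sketch of the registration pattern, with stand-ins for the syscalls.Supported and syscalls.PartiallySupported constructors:

package sketch

type syscallFn func(args ...uintptr) (uintptr, error)

type entry struct {
	name string
	fn   syscallFn
	note string   // caveat surfaced for partially supported syscalls
	urls []string // tracking issues, e.g. gvisor.dev/issue/204
}

func supported(name string, fn syscallFn) entry {
	return entry{name: name, fn: fn}
}

func partiallySupported(name string, fn syscallFn, note string, urls []string) entry {
	return entry{name: name, fn: fn, note: note, urls: urls}
}

// Assigning into the table overwrites any previous handler for that number.
func override(table map[uintptr]entry) {
	table[73] = supported("flock", func(...uintptr) (uintptr, error) { return 0, nil })
}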
diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go
index af455d5c1..e05723ef9 100644
--- a/pkg/sentry/syscalls/linux/vfs2/xattr.go
+++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go
@@ -26,8 +26,8 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
-// Listxattr implements Linux syscall listxattr(2).
-func Listxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+// ListXattr implements Linux syscall listxattr(2).
+func ListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
return listxattr(t, args, followFinalSymlink)
}
@@ -49,9 +49,9 @@ func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSyml
if err != nil {
return 0, nil, err
}
- defer tpop.Release()
+ defer tpop.Release(t)
- names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop, uint64(size))
+ names, err := t.Kernel().VFS().ListXattrAt(t, t.Credentials(), &tpop.pop, uint64(size))
if err != nil {
return 0, nil, err
}
@@ -72,9 +72,9 @@ func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
- names, err := file.Listxattr(t, uint64(size))
+ names, err := file.ListXattr(t, uint64(size))
if err != nil {
return 0, nil, err
}
@@ -85,8 +85,8 @@ func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
return uintptr(n), nil, nil
}
-// Getxattr implements Linux syscall getxattr(2).
-func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+// GetXattr implements Linux syscall getxattr(2).
+func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
return getxattr(t, args, followFinalSymlink)
}
@@ -109,14 +109,14 @@ func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli
if err != nil {
return 0, nil, err
}
- defer tpop.Release()
+ defer tpop.Release(t)
name, err := copyInXattrName(t, nameAddr)
if err != nil {
return 0, nil, err
}
- value, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.GetxattrOptions{
+ value, err := t.Kernel().VFS().GetXattrAt(t, t.Credentials(), &tpop.pop, &vfs.GetXattrOptions{
Name: name,
Size: uint64(size),
})
@@ -141,14 +141,14 @@ func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
name, err := copyInXattrName(t, nameAddr)
if err != nil {
return 0, nil, err
}
- value, err := file.Getxattr(t, &vfs.GetxattrOptions{Name: name, Size: uint64(size)})
+ value, err := file.GetXattr(t, &vfs.GetXattrOptions{Name: name, Size: uint64(size)})
if err != nil {
return 0, nil, err
}
@@ -159,8 +159,8 @@ func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return uintptr(n), nil, nil
}
-// Setxattr implements Linux syscall setxattr(2).
-func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+// SetXattr implements Linux syscall setxattr(2).
+func SetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
return 0, nil, setxattr(t, args, followFinalSymlink)
}
@@ -188,7 +188,7 @@ func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli
if err != nil {
return err
}
- defer tpop.Release()
+ defer tpop.Release(t)
name, err := copyInXattrName(t, nameAddr)
if err != nil {
@@ -199,7 +199,7 @@ func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli
return err
}
- return t.Kernel().VFS().SetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.SetxattrOptions{
+ return t.Kernel().VFS().SetXattrAt(t, t.Credentials(), &tpop.pop, &vfs.SetXattrOptions{
Name: name,
Value: value,
Flags: uint32(flags),
@@ -222,7 +222,7 @@ func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
name, err := copyInXattrName(t, nameAddr)
if err != nil {
@@ -233,15 +233,15 @@ func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, err
}
- return 0, nil, file.Setxattr(t, &vfs.SetxattrOptions{
+ return 0, nil, file.SetXattr(t, &vfs.SetXattrOptions{
Name: name,
Value: value,
Flags: uint32(flags),
})
}
-// Removexattr implements Linux syscall removexattr(2).
-func Removexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+// RemoveXattr implements Linux syscall removexattr(2).
+func RemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
return 0, nil, removexattr(t, args, followFinalSymlink)
}
@@ -262,14 +262,14 @@ func removexattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSy
if err != nil {
return err
}
- defer tpop.Release()
+ defer tpop.Release(t)
name, err := copyInXattrName(t, nameAddr)
if err != nil {
return err
}
- return t.Kernel().VFS().RemovexattrAt(t, t.Credentials(), &tpop.pop, name)
+ return t.Kernel().VFS().RemoveXattrAt(t, t.Credentials(), &tpop.pop, name)
}
// Fremovexattr implements Linux syscall fremovexattr(2).
@@ -281,14 +281,14 @@ func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
name, err := copyInXattrName(t, nameAddr)
if err != nil {
return 0, nil, err
}
- return 0, nil, file.Removexattr(t, name)
+ return 0, nil, file.RemoveXattr(t, name)
}
func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
diff --git a/pkg/sentry/time/muldiv_arm64.s b/pkg/sentry/time/muldiv_arm64.s
index 5ad57a8a3..8afc62d53 100644
--- a/pkg/sentry/time/muldiv_arm64.s
+++ b/pkg/sentry/time/muldiv_arm64.s
@@ -12,12 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#include "funcdata.h"
#include "textflag.h"
// Documentation is available in parameters.go.
//
// func muldiv64(value, multiplier, divisor uint64) (uint64, bool)
TEXT ·muldiv64(SB),NOSPLIT,$40-33
+ GO_ARGS
+ NO_LOCAL_POINTERS
MOVD value+0(FP), R0
MOVD multiplier+8(FP), R1
MOVD divisor+16(FP), R2
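
For readers who do not want to follow the ARM64 assembly, muldiv64's contract can be expressed in pure Go with math/bits. This is a behavioral sketch, not the code the sentry runs:

package sketch

import "math/bits"

// muldiv64Go returns (value * multiplier) / divisor, computing the product in
// 128 bits, and reports false if the quotient does not fit in 64 bits.
func muldiv64Go(value, multiplier, divisor uint64) (uint64, bool) {
	hi, lo := bits.Mul64(value, multiplier)
	if hi >= divisor {
		// bits.Div64 would panic here; the quotient needs more than 64 bits
		// (this also covers divisor == 0).
		return 0, false
	}
	quo, _ := bits.Div64(hi, lo, divisor)
	return quo, true
}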
diff --git a/pkg/sentry/time/parameters.go b/pkg/sentry/time/parameters.go
index 65868cb26..cd1b95117 100644
--- a/pkg/sentry/time/parameters.go
+++ b/pkg/sentry/time/parameters.go
@@ -228,11 +228,15 @@ func errorAdjust(prevParams Parameters, newParams Parameters, now TSCValue) (Par
//
// The log level is determined by the error severity.
func logErrorAdjustment(clock ClockID, errorNS ReferenceNS, orig, adjusted Parameters) {
- fn := log.Debugf
- if int64(errorNS.Magnitude()) > time.Millisecond.Nanoseconds() {
+ magNS := int64(errorNS.Magnitude())
+ if magNS <= 10*time.Microsecond.Nanoseconds() {
+ // Don't log small errors.
+ return
+ }
+ fn := log.Infof
+ if magNS > time.Millisecond.Nanoseconds() {
+ // Upgrade large errors to warning.
fn = log.Warningf
- } else if int64(errorNS.Magnitude()) > 10*time.Microsecond.Nanoseconds() {
- fn = log.Infof
}
fn("Clock(%v): error: %v ns, adjusted frequency from %v Hz to %v Hz", clock, errorNS, orig.Frequency, adjusted.Frequency)
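
The thresholds above split errors into three bands: at most 10us is dropped, up to 1ms logs at info, and anything larger warns. A standalone sketch of the selection, with generic stand-ins for log.Infof and log.Warningf:

package sketch

import "time"

type logFn func(format string, args ...interface{})

// pickLogFn returns nil for errors of at most 10us (not logged at all), the
// info logger up to 1ms, and the warning logger above that.
func pickLogFn(magNS int64, infof, warningf logFn) logFn {
	switch {
	case magNS <= 10*time.Microsecond.Nanoseconds():
		return nil
	case magNS <= time.Millisecond.Nanoseconds():
		return infof
	default:
		return warningf
	}
}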
diff --git a/pkg/sentry/time/parameters_test.go b/pkg/sentry/time/parameters_test.go
index e1b9084ac..0ce1257f6 100644
--- a/pkg/sentry/time/parameters_test.go
+++ b/pkg/sentry/time/parameters_test.go
@@ -484,3 +484,18 @@ func TestMulDivOverflow(t *testing.T) {
})
}
}
+
+func BenchmarkMuldiv64(b *testing.B) {
+ var v uint64 = math.MaxUint64
+ for i := uint64(1); i <= 1000000; i++ {
+ mult := uint64(1000000000)
+ div := i * mult
+ res, ok := muldiv64(v, mult, div)
+ if !ok {
+ b.Errorf("muldiv64(%v, %v, %v): got ok = false, want true", v, mult, div)
+ }
+ if want := v / i; res != want {
+ b.Errorf("Bad result of %v * %v / %v: got %v, want %v", v, mult, div, res, want)
+ }
+ }
+}
diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go
index ab1d140d2..5ed6726ab 100644
--- a/pkg/sentry/usage/memory.go
+++ b/pkg/sentry/usage/memory.go
@@ -278,7 +278,7 @@ func TotalMemory(memSize, used uint64) uint64 {
}
if memSize < used {
memSize = used
- // Bump totalSize to the next largest power of 2, if one exists, so
+ // Bump memSize to the next largest power of 2, if one exists, so
// that MemFree isn't 0.
if msb := bits.MostSignificantOne64(memSize); msb < 63 {
memSize = uint64(1) << (uint(msb) + 1)
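
The bump to the next power of two can also be written with the standard library. A sketch assuming MostSignificantOne64 returns the index of the highest set bit, as //pkg/bits does:

package sketch

import "math/bits"

// bumpToPow2 returns the next power of two strictly greater than memSize, or
// memSize unchanged when bit 63 is already set (no larger power of 2 exists).
func bumpToPow2(memSize uint64) uint64 {
	if memSize == 0 {
		return 1
	}
	if msb := 63 - bits.LeadingZeros64(memSize); msb < 63 {
		return uint64(1) << (uint(msb) + 1)
	}
	return memSize
}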
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 774cc66cc..440c9307c 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -27,6 +27,39 @@ go_template_instance(
},
)
+go_template_instance(
+ name = "file_description_refs",
+ out = "file_description_refs.go",
+ package = "vfs",
+ prefix = "FileDescription",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "FileDescription",
+ },
+)
+
+go_template_instance(
+ name = "mount_namespace_refs",
+ out = "mount_namespace_refs.go",
+ package = "vfs",
+ prefix = "MountNamespace",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "MountNamespace",
+ },
+)
+
+go_template_instance(
+ name = "filesystem_refs",
+ out = "filesystem_refs.go",
+ package = "vfs",
+ prefix = "Filesystem",
+ template = "//pkg/refsvfs2:refs_template",
+ types = {
+ "T": "Filesystem",
+ },
+)
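
Each go_template_instance rule above generates an embeddable reference-count type (FileDescriptionRefs, MountNamespaceRefs, FilesystemRefs) from the //pkg/refsvfs2 template. A simplified stand-in for what such a generated type provides; the real template also supports leak checking and uses an offset encoding, so treat this as a conceptual sketch:

package sketch

import "sync/atomic"

// exampleRefs mimics the generated embedded type: a counter plus a DecRef
// that runs a destructor once the last reference is dropped.
type exampleRefs struct {
	refCount int64
}

func newExampleRefs() *exampleRefs { return &exampleRefs{refCount: 1} }

func (r *exampleRefs) IncRef() {
	atomic.AddInt64(&r.refCount, 1)
}

func (r *exampleRefs) DecRef(destroy func()) {
	if atomic.AddInt64(&r.refCount, -1) == 0 && destroy != nil {
		destroy()
	}
}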
+
go_library(
name = "vfs",
srcs = [
@@ -40,28 +73,34 @@ go_library(
"event_list.go",
"file_description.go",
"file_description_impl_util.go",
+ "file_description_refs.go",
"filesystem.go",
"filesystem_impl_util.go",
+ "filesystem_refs.go",
"filesystem_type.go",
"inotify.go",
+ "lock.go",
"mount.go",
+ "mount_namespace_refs.go",
"mount_unsafe.go",
"options.go",
"pathname.go",
"permissions.go",
"resolving_path.go",
+ "save_restore.go",
"vfs.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
- "//pkg/binary",
"//pkg/context",
"//pkg/fd",
"//pkg/fdnotifier",
"//pkg/fspath",
"//pkg/gohacks",
"//pkg/log",
+ "//pkg/refs",
+ "//pkg/refsvfs2",
"//pkg/safemem",
"//pkg/sentry/arch",
"//pkg/sentry/fs",
diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md
index 9aa133bcb..5aad31b78 100644
--- a/pkg/sentry/vfs/README.md
+++ b/pkg/sentry/vfs/README.md
@@ -39,8 +39,8 @@ Mount references are held by:
- Mount: Each referenced Mount holds a reference on its parent, which is the
mount containing its mount point.
-- VirtualFilesystem: A reference is held on each Mount that has not been
- umounted.
+- VirtualFilesystem: A reference is held on each Mount that has been connected
+ to a mount point, but not yet umounted.
MountNamespace and FileDescription references are held by users of VFS. The
expectation is that each `kernel.Task` holds a reference on its corresponding
@@ -169,8 +169,6 @@ This construction, which is essentially a type-safe analogue to Linux's
- binder, which is similarly far too incomplete to use.
- - whitelistfs, which we are already actively attempting to remove.
-
- Save/restore. For instance, it is unclear if the current implementation of
the `state` package supports the inheritance pattern described above.
@@ -186,12 +184,3 @@ This construction, which is essentially a type-safe analogue to Linux's
- File locking
- `O_ASYNC`
-
-- Reference counts in the `vfs` package do not use the `refs` package since
- `refs.AtomicRefCount` adds 64 bytes of overhead to each 8-byte reference
- count, resulting in considerable cache bloat. 24 bytes of this overhead is
- for weak reference support, which have poor performance and will not be used
- by VFS2. The remaining 40 bytes is to store a descriptive string and stack
- trace for reference leak checking; we can support reference leak checking
- without incurring this space overhead by including the applicable
- information directly in finalizers for applicable types.
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
index b7c6b60b8..7ad0eaf86 100644
--- a/pkg/sentry/vfs/anonfs.go
+++ b/pkg/sentry/vfs/anonfs.go
@@ -52,6 +52,8 @@ const (
)
// anonFilesystemType implements FilesystemType.
+//
+// +stateify savable
type anonFilesystemType struct{}
// GetFilesystem implements FilesystemType.GetFilesystem.
@@ -59,22 +61,28 @@ func (anonFilesystemType) GetFilesystem(context.Context, *VirtualFilesystem, *au
panic("cannot instantiate an anon filesystem")
}
-// Name implemenents FilesystemType.Name.
+// Name implements FilesystemType.Name.
func (anonFilesystemType) Name() string {
return "none"
}
+// Release implements FilesystemType.Release.
+func (anonFilesystemType) Release(ctx context.Context) {}
+
// anonFilesystem is the implementation of FilesystemImpl that backs
// VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
//
// Since all Dentries in anonFilesystem are non-directories, all FilesystemImpl
// methods that would require an anonDentry to be a directory return ENOTDIR.
+//
+// +stateify savable
type anonFilesystem struct {
vfsfs Filesystem
devMinor uint32
}
+// +stateify savable
type anonDentry struct {
vfsd Dentry
@@ -82,7 +90,7 @@ type anonDentry struct {
}
// Release implements FilesystemImpl.Release.
-func (fs *anonFilesystem) Release() {
+func (fs *anonFilesystem) Release(ctx context.Context) {
}
// Sync implements FilesystemImpl.Sync.
@@ -245,32 +253,32 @@ func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath
return nil, syserror.ECONNREFUSED
}
-// ListxattrAt implements FilesystemImpl.ListxattrAt.
-func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements FilesystemImpl.ListXattrAt.
+func (fs *anonFilesystem) ListXattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) {
if !rp.Done() {
return nil, syserror.ENOTDIR
}
return nil, nil
}
-// GetxattrAt implements FilesystemImpl.GetxattrAt.
-func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error) {
+// GetXattrAt implements FilesystemImpl.GetXattrAt.
+func (fs *anonFilesystem) GetXattrAt(ctx context.Context, rp *ResolvingPath, opts GetXattrOptions) (string, error) {
if !rp.Done() {
return "", syserror.ENOTDIR
}
return "", syserror.ENOTSUP
}
-// SetxattrAt implements FilesystemImpl.SetxattrAt.
-func (fs *anonFilesystem) SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error {
+// SetXattrAt implements FilesystemImpl.SetXattrAt.
+func (fs *anonFilesystem) SetXattrAt(ctx context.Context, rp *ResolvingPath, opts SetXattrOptions) error {
if !rp.Done() {
return syserror.ENOTDIR
}
return syserror.EPERM
}
-// RemovexattrAt implements FilesystemImpl.RemovexattrAt.
-func (fs *anonFilesystem) RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error {
+// RemoveXattrAt implements FilesystemImpl.RemoveXattrAt.
+func (fs *anonFilesystem) RemoveXattrAt(ctx context.Context, rp *ResolvingPath, name string) error {
if !rp.Done() {
return syserror.ENOTDIR
}
@@ -294,18 +302,21 @@ func (d *anonDentry) TryIncRef() bool {
}
// DecRef implements DentryImpl.DecRef.
-func (d *anonDentry) DecRef() {
+func (d *anonDentry) DecRef(ctx context.Context) {
// no-op
}
// InotifyWithParent implements DentryImpl.InotifyWithParent.
//
-// TODO(gvisor.dev/issue/1479): Implement inotify.
-func (d *anonDentry) InotifyWithParent(events uint32, cookie uint32, et EventType) {}
+// Although Linux technically supports inotify on pseudo filesystems (inotify
+// is implemented at the vfs layer), it is not particularly useful. It is left
+// unimplemented until someone actually needs it.
+func (d *anonDentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType) {}
// Watches implements DentryImpl.Watches.
-//
-// TODO(gvisor.dev/issue/1479): Implement inotify.
func (d *anonDentry) Watches() *Watches {
return nil
}
+
+// OnZeroWatches implements Dentry.OnZeroWatches.
+func (d *anonDentry) OnZeroWatches(context.Context) {}
diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go
index c9e724fef..97018651f 100644
--- a/pkg/sentry/vfs/context.go
+++ b/pkg/sentry/vfs/context.go
@@ -40,6 +40,30 @@ func MountNamespaceFromContext(ctx context.Context) *MountNamespace {
return nil
}
+type mountNamespaceContext struct {
+ context.Context
+ mntns *MountNamespace
+}
+
+// Value implements Context.Value.
+func (mc mountNamespaceContext) Value(key interface{}) interface{} {
+ switch key {
+ case CtxMountNamespace:
+ mc.mntns.IncRef()
+ return mc.mntns
+ default:
+ return mc.Context.Value(key)
+ }
+}
+
+// WithMountNamespace returns a copy of ctx with the given MountNamespace.
+func WithMountNamespace(ctx context.Context, mntns *MountNamespace) context.Context {
+ return &mountNamespaceContext{
+ Context: ctx,
+ mntns: mntns,
+ }
+}
+
// RootFromContext returns the VFS root used by ctx. It takes a reference on
// the returned VirtualDentry. If ctx does not have a specific VFS root,
// RootFromContext returns a zero-value VirtualDentry.
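
Because the Value hook above takes a reference on every retrieval, callers of WithMountNamespace must pair each lookup with a DecRef. A usage sketch (the helper function is hypothetical):

package sketch

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// useMountNamespace shows the intended pairing.
func useMountNamespace(ctx context.Context, mntns *vfs.MountNamespace) {
	ctx = vfs.WithMountNamespace(ctx, mntns)
	// MountNamespaceFromContext goes through Value, which IncRefs.
	if got := vfs.MountNamespaceFromContext(ctx); got != nil {
		defer got.DecRef(ctx)
		// ... use got ...
	}
}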
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index 24af13eb1..320ab7ce1 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -17,6 +17,7 @@ package vfs
import (
"sync/atomic"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -88,6 +89,8 @@ func (d *Dentry) Impl() DentryImpl {
// DentryImpl contains implementation details for a Dentry. Implementations of
// DentryImpl should contain their associated Dentry by value as their first
// field.
+//
+// +stateify savable
type DentryImpl interface {
// IncRef increments the Dentry's reference count. A Dentry with a non-zero
// reference count must remain coherent with the state of the filesystem.
@@ -102,7 +105,7 @@ type DentryImpl interface {
TryIncRef() bool
// DecRef decrements the Dentry's reference count.
- DecRef()
+ DecRef(ctx context.Context)
// InotifyWithParent notifies all watches on the targets represented by this
// dentry and its parent. The parent's watches are notified first, followed
@@ -113,12 +116,29 @@ type DentryImpl interface {
//
// Note that the events may not actually propagate up to the user, depending
// on the event masks.
- InotifyWithParent(events uint32, cookie uint32, et EventType)
+ InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType)
// Watches returns the set of inotify watches for the file corresponding to
// the Dentry. Dentries that are hard links to the same underlying file
// share the same watches.
+ //
+ // Watches may return nil if the dentry belongs to a FilesystemImpl that
+ // does not support inotify. An implementation must be consistent: it must
+ // either always return a non-nil watch set or always return nil.
+ //
+ // The caller does not need to hold a reference on the dentry.
Watches() *Watches
+
+ // OnZeroWatches is called whenever the number of watches on a dentry drops
+ // to zero. This is needed by some FilesystemImpls (e.g. gofer) to manage
+ // dentry lifetime.
+ //
+ // The caller does not need to hold a reference on the dentry. OnZeroWatches
+ // may acquire inotify locks, so to prevent deadlock, no inotify locks should
+ // be held by the caller.
+ OnZeroWatches(ctx context.Context)
}
// IncRef increments d's reference count.
@@ -133,8 +153,8 @@ func (d *Dentry) TryIncRef() bool {
}
// DecRef decrements d's reference count.
-func (d *Dentry) DecRef() {
- d.impl.DecRef()
+func (d *Dentry) DecRef(ctx context.Context) {
+ d.impl.DecRef(ctx)
}
// IsDead returns true if d has been deleted or invalidated by its owning
@@ -149,17 +169,26 @@ func (d *Dentry) isMounted() bool {
return atomic.LoadUint32(&d.mounts) != 0
}
-// InotifyWithParent notifies all watches on the inodes for this dentry and
+// InotifyWithParent notifies all watches on the targets represented by d and
// its parent of events.
-func (d *Dentry) InotifyWithParent(events uint32, cookie uint32, et EventType) {
- d.impl.InotifyWithParent(events, cookie, et)
+func (d *Dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType) {
+ d.impl.InotifyWithParent(ctx, events, cookie, et)
}
// Watches returns the set of inotify watches associated with d.
+//
+// Watches will return nil if d belongs to a FilesystemImpl that does not
+// support inotify.
func (d *Dentry) Watches() *Watches {
return d.impl.Watches()
}
+// OnZeroWatches performs cleanup tasks whenever the number of watches on a
+// dentry drops to zero.
+func (d *Dentry) OnZeroWatches(ctx context.Context) {
+ d.impl.OnZeroWatches(ctx)
+}
+
// The following functions are exported so that filesystem implementations can
// use them. The vfs package, and users of VFS, should not call these
// functions.
@@ -188,11 +217,11 @@ func (vfs *VirtualFilesystem) AbortDeleteDentry(d *Dentry) {
// CommitDeleteDentry must be called after PrepareDeleteDentry if the deletion
// succeeds.
-func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) {
+func (vfs *VirtualFilesystem) CommitDeleteDentry(ctx context.Context, d *Dentry) {
d.dead = true
d.mu.Unlock()
if d.isMounted() {
- vfs.forgetDeadMountpoint(d)
+ vfs.forgetDeadMountpoint(ctx, d)
}
}
@@ -200,12 +229,12 @@ func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) {
// did for reasons outside of VFS' control (e.g. d represents the local state
// of a file on a remote filesystem on which the file has already been
// deleted).
-func (vfs *VirtualFilesystem) InvalidateDentry(d *Dentry) {
+func (vfs *VirtualFilesystem) InvalidateDentry(ctx context.Context, d *Dentry) {
d.mu.Lock()
d.dead = true
d.mu.Unlock()
if d.isMounted() {
- vfs.forgetDeadMountpoint(d)
+ vfs.forgetDeadMountpoint(ctx, d)
}
}
@@ -215,8 +244,9 @@ func (vfs *VirtualFilesystem) InvalidateDentry(d *Dentry) {
// caller must call AbortRenameDentry, CommitRenameReplaceDentry, or
// CommitRenameExchangeDentry depending on the rename's outcome.
//
-// Preconditions: If to is not nil, it must be a child Dentry from the same
-// Filesystem. from != to.
+// Preconditions:
+// * If to is not nil, it must be a child Dentry from the same Filesystem.
+// * from != to.
func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error {
vfs.mountMu.Lock()
if mntns.mountpoints[from] != 0 {
@@ -252,13 +282,13 @@ func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) {
// that was replaced by from.
//
// Preconditions: PrepareRenameDentry was previously called on from and to.
-func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, to *Dentry) {
+func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(ctx context.Context, from, to *Dentry) {
from.mu.Unlock()
if to != nil {
to.dead = true
to.mu.Unlock()
if to.isMounted() {
- vfs.forgetDeadMountpoint(to)
+ vfs.forgetDeadMountpoint(ctx, to)
}
}
}
@@ -277,7 +307,7 @@ func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) {
//
// forgetDeadMountpoint is analogous to Linux's
// fs/namespace.c:__detach_mounts().
-func (vfs *VirtualFilesystem) forgetDeadMountpoint(d *Dentry) {
+func (vfs *VirtualFilesystem) forgetDeadMountpoint(ctx context.Context, d *Dentry) {
var (
vdsToDecRef []VirtualDentry
mountsToDecRef []*Mount
@@ -290,9 +320,9 @@ func (vfs *VirtualFilesystem) forgetDeadMountpoint(d *Dentry) {
vfs.mounts.seq.EndWrite()
vfs.mountMu.Unlock()
for _, vd := range vdsToDecRef {
- vd.DecRef()
+ vd.DecRef(ctx)
}
for _, mnt := range mountsToDecRef {
- mnt.DecRef()
+ mnt.DecRef(ctx)
}
}
diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go
index 1e9dffc8f..dde2ad79b 100644
--- a/pkg/sentry/vfs/device.go
+++ b/pkg/sentry/vfs/device.go
@@ -22,6 +22,8 @@ import (
)
// DeviceKind indicates whether a device is a block or character device.
+//
+// +stateify savable
type DeviceKind uint32
const (
@@ -44,6 +46,7 @@ func (kind DeviceKind) String() string {
}
}
+// +stateify savable
type devTuple struct {
kind DeviceKind
major uint32
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
index 8297f964b..a98aac52b 100644
--- a/pkg/sentry/vfs/epoll.go
+++ b/pkg/sentry/vfs/epoll.go
@@ -27,21 +27,24 @@ import (
var epollCycleMu sync.Mutex
// EpollInstance represents an epoll instance, as described by epoll(7).
+//
+// +stateify savable
type EpollInstance struct {
vfsfd FileDescription
FileDescriptionDefaultImpl
DentryMetadataFileDescriptionImpl
+ NoLockFD
// q holds waiters on this EpollInstance.
q waiter.Queue
// interest is the set of file descriptors that are registered with the
// EpollInstance for monitoring. interest is protected by interestMu.
- interestMu sync.Mutex
+ interestMu sync.Mutex `state:"nosave"`
interest map[epollInterestKey]*epollInterest
// mu protects fields in registered epollInterests.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
// ready is the set of file descriptors that may be "ready" for I/O. Note
// that this must be an ordered list, not a map: "If more than maxevents
@@ -54,6 +57,7 @@ type EpollInstance struct {
ready epollInterestList
}
+// +stateify savable
type epollInterestKey struct {
// file is the registered FileDescription. No reference is held on file;
// instead, when the last reference is dropped, FileDescription.DecRef()
@@ -66,9 +70,11 @@ type epollInterestKey struct {
}
// epollInterest represents an EpollInstance's interest in a file descriptor.
+//
+// +stateify savable
type epollInterest struct {
// epoll is the owning EpollInstance. epoll is immutable.
- epoll *EpollInstance
+ epoll *EpollInstance `state:"wait"`
// key is the file to which this epollInterest applies. key is immutable.
key epollInterestKey
@@ -92,9 +98,9 @@ type epollInterest struct {
// NewEpollInstanceFD returns a FileDescription representing a new epoll
// instance. A reference is taken on the returned FileDescription.
-func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) {
+func (vfs *VirtualFilesystem) NewEpollInstanceFD(ctx context.Context) (*FileDescription, error) {
vd := vfs.NewAnonVirtualDentry("[eventpoll]")
- defer vd.DecRef()
+ defer vd.DecRef(ctx)
ep := &EpollInstance{
interest: make(map[epollInterestKey]*epollInterest),
}
@@ -109,7 +115,7 @@ func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) {
}
// Release implements FileDescriptionImpl.Release.
-func (ep *EpollInstance) Release() {
+func (ep *EpollInstance) Release(ctx context.Context) {
// Unregister all polled fds.
ep.interestMu.Lock()
defer ep.interestMu.Unlock()
@@ -185,7 +191,7 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event lin
}
// Register interest in file.
- mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP
+ mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP
epi := &epollInterest{
epoll: ep,
key: key,
@@ -256,7 +262,7 @@ func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event
}
// Update epi for the next call to ep.ReadEvents().
- mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP
+ mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP
ep.mu.Lock()
epi.mask = mask
epi.userData = event.Data
@@ -330,11 +336,9 @@ func (ep *EpollInstance) removeLocked(epi *epollInterest) {
ep.mu.Unlock()
}
-// ReadEvents reads up to len(events) ready events into events and returns the
-// number of events read.
-//
-// Preconditions: len(events) != 0.
-func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int {
+// ReadEvents appends up to maxEvents ready events to events and returns the
+// updated slice of events.
+func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent, maxEvents int) []linux.EpollEvent {
i := 0
// Hot path: avoid defer.
ep.mu.Lock()
@@ -367,16 +371,16 @@ func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int {
requeue.PushBack(epi)
}
// Report ievents.
- events[i] = linux.EpollEvent{
+ events = append(events, linux.EpollEvent{
Events: ievents.ToLinux(),
Data: epi.userData,
- }
+ })
i++
- if i == len(events) {
+ if i == maxEvents {
break
}
}
ep.ready.PushBackList(&requeue)
ep.mu.Unlock()
- return i
+ return events
}
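
With the append-based signature, the caller owns the event buffer and can reuse it across calls. A sketch of the new calling convention (the wrapper function is hypothetical):

package sketch

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// drainEvents reuses a caller-provided buffer instead of filling a fixed-size
// destination slice as the old API did.
func drainEvents(ep *vfs.EpollInstance, buf []linux.EpollEvent, maxEvents int) []linux.EpollEvent {
	return ep.ReadEvents(buf[:0], maxEvents)
}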
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index bb294563d..546e445aa 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -37,19 +37,28 @@ import (
// FileDescription methods require that a reference is held.
//
// FileDescription is analogous to Linux's struct file.
+//
+// +stateify savable
type FileDescription struct {
- // refs is the reference count. refs is accessed using atomic memory
- // operations.
- refs int64
+ FileDescriptionRefs
+
+ // flagsMu protects statusFlags and asyncHandler below.
+ flagsMu sync.Mutex `state:"nosave"`
// statusFlags contains status flags, "initialized by open(2) and possibly
- // modified by fcntl()" - fcntl(2). statusFlags is accessed using atomic
- // memory operations.
+ // modified by fcntl()" - fcntl(2). statusFlags can be read using atomic
+ // memory operations when it does not need to be synchronized with an
+ // access to asyncHandler.
statusFlags uint32
+ // asyncHandler handles O_ASYNC signal generation. It is set with the
+ // F_SETOWN or F_SETOWN_EX fcntls. For asyncHandler to be used, O_ASYNC must
+ // also be set by fcntl(2).
+ asyncHandler FileAsync
+
// epolls is the set of epollInterests registered for this FileDescription.
// epolls is protected by epollMu.
- epollMu sync.Mutex
+ epollMu sync.Mutex `state:"nosave"`
epolls map[*epollInterest]struct{}
// vd is the filesystem location at which this FileDescription was opened.
@@ -73,15 +82,18 @@ type FileDescription struct {
// writable is analogous to Linux's FMODE_WRITE.
writable bool
+ usedLockBSD uint32
+
// impl is the FileDescriptionImpl associated with this Filesystem. impl is
// immutable. This should be the last field in FileDescription.
impl FileDescriptionImpl
}
// FileDescriptionOptions contains options to FileDescription.Init().
+//
+// +stateify savable
type FileDescriptionOptions struct {
- // If AllowDirectIO is true, allow O_DIRECT to be set on the file. This is
- // usually only the case if O_DIRECT would actually have an effect.
+ // If AllowDirectIO is true, allow O_DIRECT to be set on the file.
AllowDirectIO bool
// If DenyPRead is true, calls to FileDescription.PRead() return ESPIPE.
@@ -93,7 +105,7 @@ type FileDescriptionOptions struct {
// If UseDentryMetadata is true, calls to FileDescription methods that
// interact with file and filesystem metadata (Stat, SetStat, StatFS,
- // Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling
+ // ListXattr, GetXattr, SetXattr, RemoveXattr) are implemented by calling
// the corresponding FilesystemImpl methods instead of the corresponding
// FileDescriptionImpl methods.
//
@@ -106,6 +118,10 @@ type FileDescriptionOptions struct {
UseDentryMetadata bool
}
+// FileCreationFlags are the set of flags passed to FileDescription.Init() but
+// omitted from FileDescription.StatusFlags().
+const FileCreationFlags = linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC
+
// Init must be called before first use of fd. If it succeeds, it takes
// references on mnt and d. flags is the initial file description flags, which
// is usually the full set of flags passed to open(2).
@@ -117,11 +133,11 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mou
}
}
- fd.refs = 1
+ fd.EnableLeakCheck()
// Remove "file creation flags" to mirror the behavior from file.f_flags in
- // fs/open.c:do_dentry_open
- fd.statusFlags = flags &^ (linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC)
+ // fs/open.c:do_dentry_open.
+ fd.statusFlags = flags &^ FileCreationFlags
fd.vd = VirtualDentry{
mount: mnt,
dentry: d,
@@ -135,30 +151,9 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mou
return nil
}
-// IncRef increments fd's reference count.
-func (fd *FileDescription) IncRef() {
- atomic.AddInt64(&fd.refs, 1)
-}
-
-// TryIncRef increments fd's reference count and returns true. If fd's
-// reference count is already zero, TryIncRef does nothing and returns false.
-//
-// TryIncRef does not require that a reference is held on fd.
-func (fd *FileDescription) TryIncRef() bool {
- for {
- refs := atomic.LoadInt64(&fd.refs)
- if refs <= 0 {
- return false
- }
- if atomic.CompareAndSwapInt64(&fd.refs, refs, refs+1) {
- return true
- }
- }
-}
-
// DecRef decrements fd's reference count.
-func (fd *FileDescription) DecRef() {
- if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 {
+func (fd *FileDescription) DecRef(ctx context.Context) {
+ fd.FileDescriptionRefs.DecRef(func() {
// Unregister fd from all epoll instances.
fd.epollMu.Lock()
epolls := fd.epolls
@@ -175,21 +170,25 @@ func (fd *FileDescription) DecRef() {
}
ep.interestMu.Unlock()
}
+
+ // If BSD locks were used, release any lock that it may have acquired.
+ if atomic.LoadUint32(&fd.usedLockBSD) != 0 {
+ fd.impl.UnlockBSD(context.Background(), fd)
+ }
+
// Release implementation resources.
- fd.impl.Release()
+ fd.impl.Release(ctx)
if fd.writable {
fd.vd.mount.EndWrite()
}
- fd.vd.DecRef()
- } else if refs < 0 {
- panic("FileDescription.DecRef() called without holding a reference")
- }
-}
-
-// Refs returns the current number of references. The returned count
-// is inherently racy and is unsafe to use without external synchronization.
-func (fd *FileDescription) Refs() int64 {
- return atomic.LoadInt64(&fd.refs)
+ fd.vd.DecRef(ctx)
+ fd.flagsMu.Lock()
+ if fd.statusFlags&linux.O_ASYNC != 0 && fd.asyncHandler != nil {
+ fd.asyncHandler.Unregister(fd)
+ }
+ fd.asyncHandler = nil
+ fd.flagsMu.Unlock()
+ })
}
// Mount returns the mount on which fd was opened. It does not take a reference
@@ -262,9 +261,20 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede
if flags&linux.O_DIRECT != 0 && !fd.opts.AllowDirectIO {
return syserror.EINVAL
}
- // TODO(jamieliu): FileDescriptionImpl.SetOAsync()?
+ // TODO(gvisor.dev/issue/1035): FileDescriptionImpl.SetOAsync()?
const settableFlags = linux.O_APPEND | linux.O_ASYNC | linux.O_DIRECT | linux.O_NOATIME | linux.O_NONBLOCK
+ fd.flagsMu.Lock()
+ if fd.asyncHandler != nil {
+ // Use fd.statusFlags instead of oldFlags, which may have become outdated,
+ // to avoid double registering/unregistering.
+ if fd.statusFlags&linux.O_ASYNC == 0 && flags&linux.O_ASYNC != 0 {
+ fd.asyncHandler.Register(fd)
+ } else if fd.statusFlags&linux.O_ASYNC != 0 && flags&linux.O_ASYNC == 0 {
+ fd.asyncHandler.Unregister(fd)
+ }
+ }
atomic.StoreUint32(&fd.statusFlags, (oldFlags&^settableFlags)|(flags&settableFlags))
+ fd.flagsMu.Unlock()
return nil
}
@@ -297,7 +307,7 @@ func (fd *FileDescription) Impl() FileDescriptionImpl {
type FileDescriptionImpl interface {
// Release is called when the associated FileDescription reaches zero
// references.
- Release()
+ Release(ctx context.Context)
// OnClose is called when a file descriptor representing the
// FileDescription is closed. Note that returning a non-nil error does not
@@ -316,6 +326,15 @@ type FileDescriptionImpl interface {
// represented by the FileDescription.
StatFS(ctx context.Context) (linux.Statfs, error)
+ // Allocate grows the file to offset + length bytes.
+ // Currently, only mode == 0 is supported.
+ //
+ // Allocate should return EISDIR on directories, ESPIPE on pipes, and ENODEV on
+ // other files where it is not supported.
+ //
+ // Preconditions: The FileDescription was opened for writing.
+ Allocate(ctx context.Context, mode, offset, length uint64) error
+
// waiter.Waitable methods may be used to poll for I/O events.
waiter.Waitable
@@ -327,8 +346,9 @@ type FileDescriptionImpl interface {
//
// - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP.
//
- // Preconditions: The FileDescription was opened for reading.
- // FileDescriptionOptions.DenyPRead == false.
+ // Preconditions:
+ // * The FileDescription was opened for reading.
+ // * FileDescriptionOptions.DenyPRead == false.
PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error)
// Read is similar to PRead, but does not specify an offset.
@@ -359,8 +379,9 @@ type FileDescriptionImpl interface {
// - If opts.Flags specifies unsupported options, PWrite returns
// EOPNOTSUPP.
//
- // Preconditions: The FileDescription was opened for writing.
- // FileDescriptionOptions.DenyPWrite == false.
+ // Preconditions:
+ // * The FileDescription was opened for writing.
+ // * FileDescriptionOptions.DenyPWrite == false.
PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error)
// Write is similar to PWrite, but does not specify an offset, which is
@@ -405,42 +426,36 @@ type FileDescriptionImpl interface {
// Ioctl implements the ioctl(2) syscall.
Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error)
- // Listxattr returns all extended attribute names for the file.
- Listxattr(ctx context.Context, size uint64) ([]string, error)
+ // ListXattr returns all extended attribute names for the file.
+ ListXattr(ctx context.Context, size uint64) ([]string, error)
- // Getxattr returns the value associated with the given extended attribute
+ // GetXattr returns the value associated with the given extended attribute
// for the file.
- Getxattr(ctx context.Context, opts GetxattrOptions) (string, error)
+ GetXattr(ctx context.Context, opts GetXattrOptions) (string, error)
- // Setxattr changes the value associated with the given extended attribute
+ // SetXattr changes the value associated with the given extended attribute
// for the file.
- Setxattr(ctx context.Context, opts SetxattrOptions) error
+ SetXattr(ctx context.Context, opts SetXattrOptions) error
- // Removexattr removes the given extended attribute from the file.
- Removexattr(ctx context.Context, name string) error
+ // RemoveXattr removes the given extended attribute from the file.
+ RemoveXattr(ctx context.Context, name string) error
// LockBSD tries to acquire a BSD-style advisory file lock.
- //
- // TODO(gvisor.dev/issue/1480): BSD-style file locking
LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error
- // LockBSD releases a BSD-style advisory file lock.
- //
- // TODO(gvisor.dev/issue/1480): BSD-style file locking
+ // UnlockBSD releases a BSD-style advisory file lock.
UnlockBSD(ctx context.Context, uid lock.UniqueID) error
// LockPOSIX tries to acquire a POSIX-style advisory file lock.
- //
- // TODO(gvisor.dev/issue/1480): POSIX-style file locking
- LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error
+ LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, start, length uint64, whence int16, block lock.Blocker) error
// UnlockPOSIX releases a POSIX-style advisory file lock.
- //
- // TODO(gvisor.dev/issue/1480): POSIX-style file locking
- UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error
+ UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, length uint64, whence int16) error
}
// Dirent holds the information contained in struct linux_dirent64.
+//
+// +stateify savable
type Dirent struct {
// Name is the filename.
Name string
@@ -467,6 +482,15 @@ type IterDirentsCallback interface {
Handle(dirent Dirent) error
}
+// IterDirentsCallbackFunc implements IterDirentsCallback for a function with
+// the semantics of IterDirentsCallback.Handle.
+type IterDirentsCallbackFunc func(dirent Dirent) error
+
+// Handle implements IterDirentsCallback.Handle.
+func (f IterDirentsCallbackFunc) Handle(dirent Dirent) error {
+ return f(dirent)
+}
+
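
The adapter above lets a plain closure serve as an IterDirentsCallback. A hypothetical collector using it (FileDescription.IterDirents is assumed to take a context and a callback, as elsewhere in this package):

package sketch

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// collectNames gathers directory entry names via a closure.
func collectNames(ctx context.Context, fd *vfs.FileDescription) ([]string, error) {
	var names []string
	err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(d vfs.Dirent) error {
		names = append(names, d.Name)
		return nil
	}))
	return names, err
}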
// OnClose is called when a file descriptor representing the FileDescription is
// closed. Returning a non-nil error should not prevent the file descriptor
// from being closed.
@@ -483,7 +507,7 @@ func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.St
Start: fd.vd,
})
stat, err := fd.vd.mount.fs.impl.StatAt(ctx, rp, opts)
- vfsObj.putResolvingPath(rp)
+ vfsObj.putResolvingPath(ctx, rp)
return stat, err
}
return fd.impl.Stat(ctx, opts)
@@ -498,7 +522,7 @@ func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) err
Start: fd.vd,
})
err := fd.vd.mount.fs.impl.SetStatAt(ctx, rp, opts)
- vfsObj.putResolvingPath(rp)
+ vfsObj.putResolvingPath(ctx, rp)
return err
}
return fd.impl.SetStat(ctx, opts)
@@ -514,23 +538,37 @@ func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
Start: fd.vd,
})
statfs, err := fd.vd.mount.fs.impl.StatFSAt(ctx, rp)
- vfsObj.putResolvingPath(rp)
+ vfsObj.putResolvingPath(ctx, rp)
return statfs, err
}
return fd.impl.StatFS(ctx)
}
-// Readiness returns fd's I/O readiness.
+// Allocate grows the file represented by fd to offset + length bytes.
+func (fd *FileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ if !fd.IsWritable() {
+ return syserror.EBADF
+ }
+ return fd.impl.Allocate(ctx, mode, offset, length)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+//
+// It returns fd's I/O readiness.
func (fd *FileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
return fd.impl.Readiness(mask)
}
-// EventRegister registers e for I/O readiness events in mask.
+// EventRegister implements waiter.Waitable.EventRegister.
+//
+// It registers e for I/O readiness events in mask.
func (fd *FileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
fd.impl.EventRegister(e, mask)
}
-// EventUnregister unregisters e for I/O readiness events.
+// EventUnregister implements waiter.Waitable.EventUnregister.
+//
+// It unregisters e for I/O readiness events.
func (fd *FileDescription) EventUnregister(e *waiter.Entry) {
fd.impl.EventUnregister(e)
}
@@ -605,25 +643,25 @@ func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.
return fd.impl.Ioctl(ctx, uio, args)
}
-// Listxattr returns all extended attribute names for the file represented by
+// ListXattr returns all extended attribute names for the file represented by
// fd.
//
// If the size of the list (including a NUL terminating byte after every entry)
// would exceed size, ERANGE may be returned (note that implementations
// are free to ignore size entirely and return without error). In all cases,
// if size is 0, the list should be returned without error, regardless of size.
-func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
+func (fd *FileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
if fd.opts.UseDentryMetadata {
vfsObj := fd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
Root: fd.vd,
Start: fd.vd,
})
- names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp, size)
- vfsObj.putResolvingPath(rp)
+ names, err := fd.vd.mount.fs.impl.ListXattrAt(ctx, rp, size)
+ vfsObj.putResolvingPath(ctx, rp)
return names, err
}
- names, err := fd.impl.Listxattr(ctx, size)
+ names, err := fd.impl.ListXattr(ctx, size)
if err == syserror.ENOTSUP {
// Linux doesn't actually return ENOTSUP in this case; instead,
// fs/xattr.c:vfs_listxattr() falls back to allowing the security
@@ -634,57 +672,57 @@ func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string
return names, err
}
-// Getxattr returns the value associated with the given extended attribute for
+// GetXattr returns the value associated with the given extended attribute for
// the file represented by fd.
//
// If the size of the return value exceeds opts.Size, ERANGE may be returned
// (note that implementations are free to ignore opts.Size entirely and return
// without error). In all cases, if opts.Size is 0, the value should be
// returned without error, regardless of size.
-func (fd *FileDescription) Getxattr(ctx context.Context, opts *GetxattrOptions) (string, error) {
+func (fd *FileDescription) GetXattr(ctx context.Context, opts *GetXattrOptions) (string, error) {
if fd.opts.UseDentryMetadata {
vfsObj := fd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
Root: fd.vd,
Start: fd.vd,
})
- val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
- vfsObj.putResolvingPath(rp)
+ val, err := fd.vd.mount.fs.impl.GetXattrAt(ctx, rp, *opts)
+ vfsObj.putResolvingPath(ctx, rp)
return val, err
}
- return fd.impl.Getxattr(ctx, *opts)
+ return fd.impl.GetXattr(ctx, *opts)
}
-// Setxattr changes the value associated with the given extended attribute for
+// SetXattr changes the value associated with the given extended attribute for
// the file represented by fd.
-func (fd *FileDescription) Setxattr(ctx context.Context, opts *SetxattrOptions) error {
+func (fd *FileDescription) SetXattr(ctx context.Context, opts *SetXattrOptions) error {
if fd.opts.UseDentryMetadata {
vfsObj := fd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
Root: fd.vd,
Start: fd.vd,
})
- err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
- vfsObj.putResolvingPath(rp)
+ err := fd.vd.mount.fs.impl.SetXattrAt(ctx, rp, *opts)
+ vfsObj.putResolvingPath(ctx, rp)
return err
}
- return fd.impl.Setxattr(ctx, *opts)
+ return fd.impl.SetXattr(ctx, *opts)
}
-// Removexattr removes the given extended attribute from the file represented
+// RemoveXattr removes the given extended attribute from the file represented
// by fd.
-func (fd *FileDescription) Removexattr(ctx context.Context, name string) error {
+func (fd *FileDescription) RemoveXattr(ctx context.Context, name string) error {
if fd.opts.UseDentryMetadata {
vfsObj := fd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
Root: fd.vd,
Start: fd.vd,
})
- err := fd.vd.mount.fs.impl.RemovexattrAt(ctx, rp, name)
- vfsObj.putResolvingPath(rp)
+ err := fd.vd.mount.fs.impl.RemoveXattrAt(ctx, rp, name)
+ vfsObj.putResolvingPath(ctx, rp)
return err
}
- return fd.impl.Removexattr(ctx, name)
+ return fd.impl.RemoveXattr(ctx, name)
}
// SyncFS instructs the filesystem containing fd to execute the semantics of
@@ -698,7 +736,7 @@ func (fd *FileDescription) MappedName(ctx context.Context) string {
vfsroot := RootFromContext(ctx)
s, _ := fd.vd.mount.vfs.PathnameWithDeleted(ctx, vfsroot, fd.vd)
if vfsroot.Ok() {
- vfsroot.DecRef()
+ vfsroot.DecRef(ctx)
}
return s
}
@@ -736,3 +774,95 @@ func (fd *FileDescription) InodeID() uint64 {
func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error {
return fd.Sync(ctx)
}
+
+// LockBSD tries to acquire a BSD-style advisory file lock.
+func (fd *FileDescription) LockBSD(ctx context.Context, lockType lock.LockType, blocker lock.Blocker) error {
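+ // Note that a BSD lock was acquired through this FD; presumably this flag
+ // is consulted when the FD is released so that the lock can be dropped.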
+ atomic.StoreUint32(&fd.usedLockBSD, 1)
+ return fd.impl.LockBSD(ctx, fd, lockType, blocker)
+}
+
+// UnlockBSD releases a BSD-style advisory file lock.
+func (fd *FileDescription) UnlockBSD(ctx context.Context) error {
+ return fd.impl.UnlockBSD(ctx, fd)
+}
+
+// LockPOSIX locks a POSIX-style file range lock.
+func (fd *FileDescription) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, start, end uint64, whence int16, block lock.Blocker) error {
+ return fd.impl.LockPOSIX(ctx, uid, t, start, end, whence, block)
+}
+
+// UnlockPOSIX unlocks a POSIX-style file range lock.
+func (fd *FileDescription) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, end uint64, whence int16) error {
+ return fd.impl.UnlockPOSIX(ctx, uid, start, end, whence)
+}
+
+// A FileAsync sends signals to its owner when the registered Waitable is
+// ready for IO. This is only implemented by pkg/sentry/fasync:FileAsync, but
+// we unfortunately need this interface to avoid circular dependencies.
+type FileAsync interface {
+ Register(w waiter.Waitable)
+ Unregister(w waiter.Waitable)
+}
+
+// AsyncHandler returns the FileAsync for fd.
+func (fd *FileDescription) AsyncHandler() FileAsync {
+ fd.flagsMu.Lock()
+ defer fd.flagsMu.Unlock()
+ return fd.asyncHandler
+}
+
+// SetAsyncHandler sets fd.asyncHandler if it has not been set before and
+// returns it.
+func (fd *FileDescription) SetAsyncHandler(newHandler func() FileAsync) FileAsync {
+ fd.flagsMu.Lock()
+ defer fd.flagsMu.Unlock()
+ if fd.asyncHandler == nil {
+ fd.asyncHandler = newHandler()
+ if fd.statusFlags&linux.O_ASYNC != 0 {
+ fd.asyncHandler.Register(fd)
+ }
+ }
+ return fd.asyncHandler
+}
+
+// FileReadWriteSeeker is a helper struct to pass a FileDescription as
+// io.Reader/io.Writer/io.ReadSeeker/io.ReaderAt/io.WriterAt/etc.
+type FileReadWriteSeeker struct {
+ FD *FileDescription
+ Ctx context.Context
+ ROpts ReadOptions
+ WOpts WriteOptions
+}
+
+// ReadAt implements io.ReaderAt.ReadAt.
+func (f *FileReadWriteSeeker) ReadAt(p []byte, off int64) (int, error) {
+ dst := usermem.BytesIOSequence(p)
+ n, err := f.FD.PRead(f.Ctx, dst, off, f.ROpts)
+ return int(n), err
+}
+
+// Read implements io.Reader.Read.
+func (f *FileReadWriteSeeker) Read(p []byte) (int, error) {
+ dst := usermem.BytesIOSequence(p)
+ n, err := f.FD.Read(f.Ctx, dst, f.ROpts)
+ return int(n), err
+}
+
+// Seek implements io.ReadWriteSeeker.Seek.
+func (f *FileReadWriteSeeker) Seek(offset int64, whence int) (int64, error) {
+ return f.FD.Seek(f.Ctx, offset, int32(whence))
+}
+
+// WriteAt implements io.WriterAt.WriteAt.
+func (f *FileReadWriteSeeker) WriteAt(p []byte, off int64) (int, error) {
+ src := usermem.BytesIOSequence(p)
+ n, err := f.FD.PWrite(f.Ctx, src, off, f.WOpts)
+ return int(n), err
+}
+
+// Write implements io.Writer.Write.
+func (f *FileReadWriteSeeker) Write(p []byte) (int, error) {
+ src := usermem.BytesIOSequence(p)
+ n, err := f.FD.Write(f.Ctx, src, f.WOpts)
+ return int(n), err
+}
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index f4c111926..48ca9de44 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -21,7 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
@@ -42,6 +42,8 @@ import (
// FileDescriptionDefaultImpl may be embedded by implementations of
// FileDescriptionImpl to obtain implementations of many FileDescriptionImpl
// methods with default behavior analogous to Linux's.
+//
+// +stateify savable
type FileDescriptionDefaultImpl struct{}
// OnClose implements FileDescriptionImpl.OnClose analogously to
@@ -56,6 +58,16 @@ func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, err
return linux.Statfs{}, syserror.ENOSYS
}
+// Allocate implements FileDescriptionImpl.Allocate analogously to
+// fallocate called on an invalid type of file in Linux.
+//
+// Note that directories can rely on this implementation even though they
+// should technically return EISDIR. Allocate should never be called for a
+// directory, because it requires a writable fd.
+func (FileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ return syserror.ENODEV
+}
+
// Readiness implements waiter.Waitable.Readiness analogously to
// file_operations::poll == NULL in Linux.
func (FileDescriptionDefaultImpl) Readiness(mask waiter.EventMask) waiter.EventMask {
@@ -128,56 +140,43 @@ func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, arg
return 0, syserror.ENOTTY
}
-// Listxattr implements FileDescriptionImpl.Listxattr analogously to
+// ListXattr implements FileDescriptionImpl.ListXattr analogously to
// inode_operations::listxattr == NULL in Linux.
-func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context, size uint64) ([]string, error) {
- // This isn't exactly accurate; see FileDescription.Listxattr.
+func (FileDescriptionDefaultImpl) ListXattr(ctx context.Context, size uint64) ([]string, error) {
+ // This isn't exactly accurate; see FileDescription.ListXattr.
return nil, syserror.ENOTSUP
}
-// Getxattr implements FileDescriptionImpl.Getxattr analogously to
+// GetXattr implements FileDescriptionImpl.GetXattr analogously to
// inode::i_opflags & IOP_XATTR == 0 in Linux.
-func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, opts GetxattrOptions) (string, error) {
+func (FileDescriptionDefaultImpl) GetXattr(ctx context.Context, opts GetXattrOptions) (string, error) {
return "", syserror.ENOTSUP
}
-// Setxattr implements FileDescriptionImpl.Setxattr analogously to
+// SetXattr implements FileDescriptionImpl.SetXattr analogously to
// inode::i_opflags & IOP_XATTR == 0 in Linux.
-func (FileDescriptionDefaultImpl) Setxattr(ctx context.Context, opts SetxattrOptions) error {
+func (FileDescriptionDefaultImpl) SetXattr(ctx context.Context, opts SetXattrOptions) error {
return syserror.ENOTSUP
}
-// Removexattr implements FileDescriptionImpl.Removexattr analogously to
+// RemoveXattr implements FileDescriptionImpl.RemoveXattr analogously to
// inode::i_opflags & IOP_XATTR == 0 in Linux.
-func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) error {
+func (FileDescriptionDefaultImpl) RemoveXattr(ctx context.Context, name string) error {
return syserror.ENOTSUP
}
-// LockBSD implements FileDescriptionImpl.LockBSD.
-func (FileDescriptionDefaultImpl) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error {
- return syserror.EBADF
-}
-
-// UnlockBSD implements FileDescriptionImpl.UnlockBSD.
-func (FileDescriptionDefaultImpl) UnlockBSD(ctx context.Context, uid lock.UniqueID) error {
- return syserror.EBADF
-}
-
-// LockPOSIX implements FileDescriptionImpl.LockPOSIX.
-func (FileDescriptionDefaultImpl) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error {
- return syserror.EBADF
-}
-
-// UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX.
-func (FileDescriptionDefaultImpl) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error {
- return syserror.EBADF
-}
-
// DirectoryFileDescriptionDefaultImpl may be embedded by implementations of
// FileDescriptionImpl that always represent directories to obtain
// implementations of non-directory I/O methods that return EISDIR.
+//
+// +stateify savable
type DirectoryFileDescriptionDefaultImpl struct{}
+// Allocate implements FileDescriptionImpl.Allocate.
+func (DirectoryFileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ return syserror.EISDIR
+}
+
// PRead implements FileDescriptionImpl.PRead.
func (DirectoryFileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
return 0, syserror.EISDIR
@@ -201,6 +200,8 @@ func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src userme
// DentryMetadataFileDescriptionImpl may be embedded by implementations of
// FileDescriptionImpl for which FileDescriptionOptions.UseDentryMetadata is
// true to obtain implementations of Stat and SetStat that panic.
+//
+// +stateify savable
type DentryMetadataFileDescriptionImpl struct{}
// Stat implements FileDescriptionImpl.Stat.
@@ -215,12 +216,16 @@ func (DentryMetadataFileDescriptionImpl) SetStat(ctx context.Context, opts SetSt
// DynamicBytesSource represents a data source for a
// DynamicBytesFileDescriptionImpl.
+//
+// +stateify savable
type DynamicBytesSource interface {
// Generate writes the file's contents to buf.
Generate(ctx context.Context, buf *bytes.Buffer) error
}
// StaticData implements DynamicBytesSource over a static string.
+//
+// +stateify savable
type StaticData struct {
Data string
}
@@ -247,14 +252,24 @@ type WritableDynamicBytesSource interface {
//
// DynamicBytesFileDescriptionImpl.SetDataSource() must be called before first
// use.
+//
+// +stateify savable
type DynamicBytesFileDescriptionImpl struct {
data DynamicBytesSource // immutable
- mu sync.Mutex // protects the following fields
- buf bytes.Buffer
+ mu sync.Mutex `state:"nosave"` // protects the following fields
+ buf bytes.Buffer `state:".([]byte)"`
off int64
lastRead int64 // offset at which the last Read, PRead, or Seek ended
}
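+// saveBuf is the save hook for buf (see its state:".([]byte)" tag); the
+// state framework calls it to capture the buffer's contents as a byte slice.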
+func (fd *DynamicBytesFileDescriptionImpl) saveBuf() []byte {
+ return fd.buf.Bytes()
+}
+
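+// loadBuf is the matching restore hook; it repopulates buf from the saved
+// bytes.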
+func (fd *DynamicBytesFileDescriptionImpl) loadBuf(p []byte) {
+ fd.buf.Write(p)
+}
+
// SetDataSource must be called exactly once on fd before first use.
func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) {
fd.data = data
@@ -347,7 +362,7 @@ func (fd *DynamicBytesFileDescriptionImpl) pwriteLocked(ctx context.Context, src
writable, ok := fd.data.(WritableDynamicBytesSource)
if !ok {
- return 0, syserror.EINVAL
+ return 0, syserror.EIO
}
n, err := writable.Write(ctx, src, offset)
if err != nil {
@@ -384,3 +399,58 @@ func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.M
fd.IncRef()
return nil
}
+
+// LockFD may be used by most implementations of FileDescriptionImpl.Lock*
+// functions. Caller must call Init().
+//
+// +stateify savable
+type LockFD struct {
+ locks *FileLocks
+}
+
+// Init initializes fd with FileLocks to use.
+func (fd *LockFD) Init(locks *FileLocks) {
+ fd.locks = locks
+}
+
+// Locks returns the locks associated with this file.
+func (fd *LockFD) Locks() *FileLocks {
+ return fd.locks
+}
+
+// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
+func (fd *LockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
+ return fd.locks.LockBSD(uid, t, block)
+}
+
+// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD.
+func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error {
+ fd.locks.UnlockBSD(uid)
+ return nil
+}
+
+// NoLockFD implements Lock*/Unlock* portion of FileDescriptionImpl interface
+// returning ENOLCK.
+//
+// +stateify savable
+type NoLockFD struct{}
+
+// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
+func (NoLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
+ return syserror.ENOLCK
+}
+
+// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD.
+func (NoLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error {
+ return syserror.ENOLCK
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (NoLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return syserror.ENOLCK
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (NoLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return syserror.ENOLCK
+}
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index 3a75d4d62..1cd607c0a 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -33,6 +33,7 @@ import (
type fileDescription struct {
vfsfd FileDescription
FileDescriptionDefaultImpl
+ NoLockFD
}
// genCount contains the number of times its DynamicBytesSource.Generate()
@@ -79,9 +80,9 @@ type testFD struct {
data DynamicBytesSource
}
-func newTestFD(vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesSource) *FileDescription {
+func newTestFD(ctx context.Context, vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesSource) *FileDescription {
vd := vfsObj.NewAnonVirtualDentry("genCountFD")
- defer vd.DecRef()
+ defer vd.DecRef(ctx)
var fd testFD
fd.vfsfd.Init(&fd, statusFlags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{})
fd.DynamicBytesFileDescriptionImpl.SetDataSource(data)
@@ -89,7 +90,7 @@ func newTestFD(vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesS
}
// Release implements FileDescriptionImpl.Release.
-func (fd *testFD) Release() {
+func (fd *testFD) Release(context.Context) {
}
// SetStatusFlags implements FileDescriptionImpl.SetStatusFlags.
@@ -108,11 +109,11 @@ func TestGenCountFD(t *testing.T) {
ctx := contexttest.Context(t)
vfsObj := &VirtualFilesystem{}
- if err := vfsObj.Init(); err != nil {
+ if err := vfsObj.Init(ctx); err != nil {
t.Fatalf("VFS init: %v", err)
}
- fd := newTestFD(vfsObj, linux.O_RDWR, &genCount{})
- defer fd.DecRef()
+ fd := newTestFD(ctx, vfsObj, linux.O_RDWR, &genCount{})
+ defer fd.DecRef(ctx)
// The first read causes Generate to be called to fill the FD's buffer.
buf := make([]byte, 2)
@@ -154,11 +155,11 @@ func TestGenCountFD(t *testing.T) {
}
// Write and PWrite fails.
- if _, err := fd.Write(ctx, ioseq, WriteOptions{}); err != syserror.EINVAL {
- t.Errorf("Write: got err %v, wanted %v", err, syserror.EINVAL)
+ if _, err := fd.Write(ctx, ioseq, WriteOptions{}); err != syserror.EIO {
+ t.Errorf("Write: got err %v, wanted %v", err, syserror.EIO)
}
- if _, err := fd.PWrite(ctx, ioseq, 0, WriteOptions{}); err != syserror.EINVAL {
- t.Errorf("Write: got err %v, wanted %v", err, syserror.EINVAL)
+ if _, err := fd.PWrite(ctx, ioseq, 0, WriteOptions{}); err != syserror.EIO {
+ t.Errorf("Write: got err %v, wanted %v", err, syserror.EIO)
}
}
@@ -166,11 +167,11 @@ func TestWritable(t *testing.T) {
ctx := contexttest.Context(t)
vfsObj := &VirtualFilesystem{}
- if err := vfsObj.Init(); err != nil {
+ if err := vfsObj.Init(ctx); err != nil {
t.Fatalf("VFS init: %v", err)
}
- fd := newTestFD(vfsObj, linux.O_RDWR, &storeData{data: "init"})
- defer fd.DecRef()
+ fd := newTestFD(ctx, vfsObj, linux.O_RDWR, &storeData{data: "init"})
+ defer fd.DecRef(ctx)
buf := make([]byte, 10)
ioseq := usermem.BytesIOSequence(buf)
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 1edd584c9..c93d94634 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -15,8 +15,6 @@
package vfs
import (
- "sync/atomic"
-
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
@@ -34,9 +32,7 @@ import (
//
// +stateify savable
type Filesystem struct {
- // refs is the reference count. refs is accessed using atomic memory
- // operations.
- refs int64
+ FilesystemRefs
// vfs is the VirtualFilesystem that uses this Filesystem. vfs is
// immutable.
@@ -52,7 +48,7 @@ type Filesystem struct {
// Init must be called before first use of fs.
func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, fsType FilesystemType, impl FilesystemImpl) {
- fs.refs = 1
+ fs.EnableLeakCheck()
fs.vfs = vfsObj
fs.fsType = fsType
fs.impl = impl
@@ -76,39 +72,14 @@ func (fs *Filesystem) Impl() FilesystemImpl {
return fs.impl
}
-// IncRef increments fs' reference count.
-func (fs *Filesystem) IncRef() {
- if atomic.AddInt64(&fs.refs, 1) <= 1 {
- panic("Filesystem.IncRef() called without holding a reference")
- }
-}
-
-// TryIncRef increments fs' reference count and returns true. If fs' reference
-// count is zero, TryIncRef does nothing and returns false.
-//
-// TryIncRef does not require that a reference is held on fs.
-func (fs *Filesystem) TryIncRef() bool {
- for {
- refs := atomic.LoadInt64(&fs.refs)
- if refs <= 0 {
- return false
- }
- if atomic.CompareAndSwapInt64(&fs.refs, refs, refs+1) {
- return true
- }
- }
-}
-
// DecRef decrements fs' reference count.
-func (fs *Filesystem) DecRef() {
- if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 {
+func (fs *Filesystem) DecRef(ctx context.Context) {
+ fs.FilesystemRefs.DecRef(func() {
fs.vfs.filesystemsMu.Lock()
delete(fs.vfs.filesystems, fs)
fs.vfs.filesystemsMu.Unlock()
- fs.impl.Release()
- } else if refs < 0 {
- panic("Filesystem.decRef() called without holding a reference")
- }
+ fs.impl.Release(ctx)
+ })
}
// FilesystemImpl contains implementation details for a Filesystem.
@@ -149,7 +120,7 @@ func (fs *Filesystem) DecRef() {
type FilesystemImpl interface {
// Release is called when the associated Filesystem reaches zero
// references.
- Release()
+ Release(ctx context.Context)
// Sync "causes all pending modifications to filesystem metadata and cached
// file data to be written to the underlying [filesystem]", as by syncfs(2).
@@ -212,8 +183,9 @@ type FilesystemImpl interface {
// ENOENT. Equivalently, if vd represents a file with a link count of 0 not
// created by open(O_TMPFILE) without O_EXCL, LinkAt returns ENOENT.
//
- // Preconditions: !rp.Done(). For the final path component in rp,
- // !rp.ShouldFollowSymlink().
+ // Preconditions:
+ // * !rp.Done().
+ // * For the final path component in rp, !rp.ShouldFollowSymlink().
//
// Postconditions: If LinkAt returns an error returned by
// ResolvingPath.Resolve*(), then !rp.Done().
@@ -231,8 +203,9 @@ type FilesystemImpl interface {
// - If the directory in which the new directory would be created has been
// removed by RmdirAt or RenameAt, MkdirAt returns ENOENT.
//
- // Preconditions: !rp.Done(). For the final path component in rp,
- // !rp.ShouldFollowSymlink().
+ // Preconditions:
+ // * !rp.Done().
+ // * For the final path component in rp, !rp.ShouldFollowSymlink().
//
// Postconditions: If MkdirAt returns an error returned by
// ResolvingPath.Resolve*(), then !rp.Done().
@@ -253,8 +226,9 @@ type FilesystemImpl interface {
// - If the directory in which the file would be created has been removed
// by RmdirAt or RenameAt, MknodAt returns ENOENT.
//
- // Preconditions: !rp.Done(). For the final path component in rp,
- // !rp.ShouldFollowSymlink().
+ // Preconditions:
+ // * !rp.Done().
+ // * For the final path component in rp, !rp.ShouldFollowSymlink().
//
// Postconditions: If MknodAt returns an error returned by
// ResolvingPath.Resolve*(), then !rp.Done().
@@ -345,11 +319,12 @@ type FilesystemImpl interface {
// - If renaming would replace a non-empty directory, RenameAt returns
// ENOTEMPTY.
//
- // Preconditions: !rp.Done(). For the final path component in rp,
- // !rp.ShouldFollowSymlink(). oldParentVD.Dentry() was obtained from a
- // previous call to
- // oldParentVD.Mount().Filesystem().Impl().GetParentDentryAt(). oldName is
- // not "." or "..".
+ // Preconditions:
+ // * !rp.Done().
+ // * For the final path component in rp, !rp.ShouldFollowSymlink().
+ // * oldParentVD.Dentry() was obtained from a previous call to
+ // oldParentVD.Mount().Filesystem().Impl().GetParentDentryAt().
+ // * oldName is not "." or "..".
//
// Postconditions: If RenameAt returns an error returned by
// ResolvingPath.Resolve*(), then !rp.Done().
@@ -372,8 +347,9 @@ type FilesystemImpl interface {
// - If the file at rp exists but is not a directory, RmdirAt returns
// ENOTDIR.
//
- // Preconditions: !rp.Done(). For the final path component in rp,
- // !rp.ShouldFollowSymlink().
+ // Preconditions:
+ // * !rp.Done().
+ // * For the final path component in rp, !rp.ShouldFollowSymlink().
//
// Postconditions: If RmdirAt returns an error returned by
// ResolvingPath.Resolve*(), then !rp.Done().
@@ -410,8 +386,9 @@ type FilesystemImpl interface {
// - If the directory in which the symbolic link would be created has been
// removed by RmdirAt or RenameAt, SymlinkAt returns ENOENT.
//
- // Preconditions: !rp.Done(). For the final path component in rp,
- // !rp.ShouldFollowSymlink().
+ // Preconditions:
+ // * !rp.Done().
+ // * For the final path component in rp, !rp.ShouldFollowSymlink().
//
// Postconditions: If SymlinkAt returns an error returned by
// ResolvingPath.Resolve*(), then !rp.Done().
@@ -431,33 +408,34 @@ type FilesystemImpl interface {
//
// - If the file at rp exists but is a directory, UnlinkAt returns EISDIR.
//
- // Preconditions: !rp.Done(). For the final path component in rp,
- // !rp.ShouldFollowSymlink().
+ // Preconditions:
+ // * !rp.Done().
+ // * For the final path component in rp, !rp.ShouldFollowSymlink().
//
// Postconditions: If UnlinkAt returns an error returned by
// ResolvingPath.Resolve*(), then !rp.Done().
UnlinkAt(ctx context.Context, rp *ResolvingPath) error
- // ListxattrAt returns all extended attribute names for the file at rp.
+ // ListXattrAt returns all extended attribute names for the file at rp.
//
// Errors:
//
// - If extended attributes are not supported by the filesystem,
- // ListxattrAt returns ENOTSUP.
+ // ListXattrAt returns ENOTSUP.
//
// - If the size of the list (including a NUL terminating byte after every
// entry) would exceed size, ERANGE may be returned (note that
// implementations are free to ignore size entirely and return without
// error). In all cases, if size is 0, the list should be returned without
// error, regardless of size.
- ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error)
+ ListXattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error)
- // GetxattrAt returns the value associated with the given extended
+ // GetXattrAt returns the value associated with the given extended
// attribute for the file at rp.
//
// Errors:
//
- // - If extended attributes are not supported by the filesystem, GetxattrAt
+ // - If extended attributes are not supported by the filesystem, GetXattrAt
// returns ENOTSUP.
//
// - If an extended attribute named opts.Name does not exist, ENODATA is
@@ -467,30 +445,30 @@ type FilesystemImpl interface {
// returned (note that implementations are free to ignore opts.Size entirely
// and return without error). In all cases, if opts.Size is 0, the value
// should be returned without error, regardless of size.
- GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error)
+ GetXattrAt(ctx context.Context, rp *ResolvingPath, opts GetXattrOptions) (string, error)
- // SetxattrAt changes the value associated with the given extended
+ // SetXattrAt changes the value associated with the given extended
// attribute for the file at rp.
//
// Errors:
//
- // - If extended attributes are not supported by the filesystem, SetxattrAt
+ // - If extended attributes are not supported by the filesystem, SetXattrAt
// returns ENOTSUP.
//
// - If XATTR_CREATE is set in opts.Flag and opts.Name already exists,
// EEXIST is returned. If XATTR_REPLACE is set and opts.Name does not exist,
// ENODATA is returned.
- SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error
+ SetXattrAt(ctx context.Context, rp *ResolvingPath, opts SetXattrOptions) error
- // RemovexattrAt removes the given extended attribute from the file at rp.
+ // RemoveXattrAt removes the given extended attribute from the file at rp.
//
// Errors:
//
// - If extended attributes are not supported by the filesystem,
- // RemovexattrAt returns ENOTSUP.
+ // RemoveXattrAt returns ENOTSUP.
//
// - If name does not exist, ENODATA is returned.
- RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error
+ RemoveXattrAt(ctx context.Context, rp *ResolvingPath, name string) error
// BoundEndpointAt returns the Unix socket endpoint bound at the path rp.
//
@@ -524,12 +502,12 @@ type FilesystemImpl interface {
//
// Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl.
PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error
-
- // TODO(gvisor.dev/issue/1479): inotify_add_watch()
}
// PrependPathAtVFSRootError is returned by implementations of
// FilesystemImpl.PrependPath() when they encounter the contextual VFS root.
+//
+// +stateify savable
type PrependPathAtVFSRootError struct{}
// Error implements error.Error.
@@ -540,6 +518,8 @@ func (PrependPathAtVFSRootError) Error() string {
// PrependPathAtNonMountRootError is returned by implementations of
// FilesystemImpl.PrependPath() when they encounter an independent ancestor
// Dentry that is not the Mount root.
+//
+// +stateify savable
type PrependPathAtNonMountRootError struct{}
// Error implements error.Error.
@@ -550,6 +530,8 @@ func (PrependPathAtNonMountRootError) Error() string {
// PrependPathSyntheticError is returned by implementations of
// FilesystemImpl.PrependPath() for which prepended names do not represent real
// paths.
+//
+// +stateify savable
type PrependPathSyntheticError struct{}
// Error implements error.Error.
diff --git a/pkg/sentry/vfs/filesystem_impl_util.go b/pkg/sentry/vfs/filesystem_impl_util.go
index 465e610e0..2620cf975 100644
--- a/pkg/sentry/vfs/filesystem_impl_util.go
+++ b/pkg/sentry/vfs/filesystem_impl_util.go
@@ -16,6 +16,9 @@ package vfs
import (
"strings"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// GenericParseMountOptions parses a comma-separated list of options of the
@@ -41,3 +44,13 @@ func GenericParseMountOptions(str string) map[string]string {
}
return m
}
+
+// GenericStatFS returns a statfs struct filled with the common fields for a
+// general filesystem. This is analogous to Linux's fs/libfs.c:simple_statfs().
+func GenericStatFS(fsMagic uint64) linux.Statfs {
+ return linux.Statfs{
+ Type: fsMagic,
+ BlockSize: usermem.PageSize,
+ NameLength: linux.NAME_MAX,
+ }
+}
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go
index f2298f7f6..9d54cc4ed 100644
--- a/pkg/sentry/vfs/filesystem_type.go
+++ b/pkg/sentry/vfs/filesystem_type.go
@@ -33,6 +33,9 @@ type FilesystemType interface {
// Name returns the name of this FilesystemType.
Name() string
+
+ // Release releases all resources held by this FilesystemType.
+ Release(ctx context.Context)
}
// GetFilesystemOptions contains options to FilesystemType.GetFilesystem.
@@ -55,10 +58,13 @@ type registeredFilesystemType struct {
// RegisterFilesystemTypeOptions contains options to
// VirtualFilesystem.RegisterFilesystem().
+//
+// +stateify savable
type RegisterFilesystemTypeOptions struct {
- // If AllowUserMount is true, allow calls to VirtualFilesystem.MountAt()
- // for which MountOptions.InternalMount == false to use this filesystem
- // type.
+ // AllowUserMount determines whether users are allowed to mount a filesystem
+ // of this type, i.e. through mount(2). If AllowUserMount is true, allow calls
+ // to VirtualFilesystem.MountAt() for which MountOptions.InternalMount == false
+ // to use this filesystem type.
AllowUserMount bool
// If AllowUserList is true, make this filesystem type visible in
diff --git a/pkg/sentry/vfs/g3doc/inotify.md b/pkg/sentry/vfs/g3doc/inotify.md
new file mode 100644
index 000000000..833db213f
--- /dev/null
+++ b/pkg/sentry/vfs/g3doc/inotify.md
@@ -0,0 +1,210 @@
+# Inotify
+
+Inotify is a mechanism for monitoring filesystem events in Linux; see
+inotify(7). An inotify instance can be used to monitor files and directories for
+modifications, creation/deletion, etc. The inotify API consists of system calls
+that create inotify instances (inotify_init/inotify_init1) and add/remove
+watches on files to an instance (inotify_add_watch/inotify_rm_watch). Events are
+generated from various places in the sentry, including the syscall layer, the
+vfs layer, the process fd table, and within each filesystem implementation. This
+document outlines the implementation details of inotify in VFS2.
+
+## Inotify Objects
+
+Inotify data structures are implemented in the vfs package.
+
+### vfs.Inotify
+
+Inotify instances are represented by vfs.Inotify objects, which implement
+vfs.FileDescriptionImpl. As in Linux, inotify fds are backed by a
+pseudo-filesystem (anonfs). Each inotify instance receives events from a set of
+vfs.Watch objects, which can be modified with inotify_add_watch(2) and
+inotify_rm_watch(2). An application can retrieve events by reading the inotify
+fd.
+
+### vfs.Watches
+
+The set of all watches held on a single file (i.e., the watch target) is stored
+in vfs.Watches. Each watch will belong to a different inotify instance (an
+instance can only have one watch on any watch target). The watches are stored in
+a map indexed by their vfs.Inotify owner’s id. Hard links and file descriptions
+to a single file will all share the same vfs.Watches (with the exception of the
+gofer filesystem, described in a later section). Activity on the target causes
+its vfs.Watches to generate notifications on its watches’ inotify instances.
+
+### vfs.Watch
+
+A single watch, owned by one inotify instance and applied to one watch target.
+Both the vfs.Inotify owner and vfs.Watches on the target will hold a vfs.Watch,
+which leads to some complicated locking behavior (see Lock Ordering). Whenever a
+watch is notified of an event on its target, it will queue events to its inotify
+instance for delivery to the user.
+
+### vfs.Event
+
+vfs.Event is a simple struct encapsulating all the fields for an inotify event.
+It is generated by vfs.Watches and forwarded to the watches' owners. It is
+serialized to the user during read(2) syscalls on the associated vfs.Inotify's
+fd.
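+
+The wire format seen by applications matches Linux's struct inotify_event. A
+minimal sketch of the layout (illustrative Go; field names follow inotify(7),
+not the sentry's own definitions):
+
+```go
+// inotifyEvent mirrors struct inotify_event: a fixed 16-byte header followed
+// by an optional NUL-padded name.
+type inotifyEvent struct {
+    Wd     int32  // watch descriptor that generated the event
+    Mask   uint32 // event mask, e.g. IN_CREATE|IN_ISDIR
+    Cookie uint32 // links IN_MOVED_FROM/IN_MOVED_TO pairs
+    Len    uint32 // length of the trailing name, including NUL padding
+    // Len bytes of name follow the header.
+}
+```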
+
+## Lock Ordering
+
+There are three locks related to the inotify implementation:
+
+*   Inotify.mu: the inotify instance lock.
+*   Inotify.evMu: the inotify event queue lock.
+*   Watches.mu: the watch set lock, used to protect the collection of watches
+    on a target.
+
+The correct lock ordering for inotify code is:
+
+Inotify.mu -> Watches.mu -> Inotify.evMu.
+
+Note that we use a distinct lock to protect the inotify event queue. If we
+simply used Inotify.mu, we could simultaneously have locks being acquired in the
+order of Inotify.mu -> Watches.mu and Watches.mu -> Inotify.mu, which would
+cause deadlocks. For instance, adding a watch to an inotify instance would
+require locking Inotify.mu, and then adding the same watch to the target would
+cause Watches.mu to be held. At the same time, generating an event on the target
+would require Watches.mu to be held before iterating through each watch, and
+then notifying the owner of each watch would cause Inotify.mu to be held.
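+
+A simplified, self-contained sketch (not the sentry's actual code) of why the
+chosen ordering is acyclic: the add-watch path takes Inotify.mu and then
+Watches.mu, while the notification path takes Watches.mu and then only the
+event queue lock, never Inotify.mu.
+
+```go
+package sketch
+
+import "sync"
+
+type Inotify struct {
+    mu   sync.Mutex // instance lock (Inotify.mu)
+    evMu sync.Mutex // event queue lock (Inotify.evMu)
+}
+
+type Watch struct{ owner *Inotify }
+
+type Watches struct {
+    mu sync.Mutex // watch set lock (Watches.mu)
+    ws map[uint64]*Watch
+}
+
+// addWatch: Inotify.mu -> Watches.mu.
+func addWatch(i *Inotify, target *Watches, id uint64, w *Watch) {
+    i.mu.Lock()
+    defer i.mu.Unlock()
+    target.mu.Lock()
+    defer target.mu.Unlock()
+    target.ws[id] = w
+}
+
+// notify: Watches.mu -> Inotify.evMu. Inotify.mu is never acquired on this
+// path, so the two paths cannot deadlock against each other.
+func notify(target *Watches) {
+    target.mu.Lock()
+    defer target.mu.Unlock()
+    for _, w := range target.ws {
+        w.owner.evMu.Lock()
+        // Queue the event on w.owner's event list here.
+        w.owner.evMu.Unlock()
+    }
+}
+```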
+
+See the vfs package comment to understand how inotify locks fit into the overall
+ordering of filesystem locks.
+
+## Watch Targets in Different Filesystem Implementations
+
+In Linux, watches reside on inodes at the virtual filesystem layer. As a result,
+all hard links and file descriptions on a single file share the same
+watch set. In VFS2, there is no common inode structure across filesystem types
+(some may not even have inodes), so we have to plumb inotify support through
+each specific filesystem implementation. Some of the technical considerations
+are outlined below.
+
+### Tmpfs
+
+For filesystems with inodes, like tmpfs, the design is quite similar to that of
+Linux, where watches reside on the inode.
+
+### Pseudo-filesystems
+
+Technically, because inotify is implemented at the vfs layer in Linux,
+pseudo-filesystems on top of kernfs support inotify passively. However, watches
+can only track explicit filesystem operations like read/write, open/close,
+mknod, etc., so watches on a target like /proc/self/fd will not generate events
+every time a new fd is added or removed. As of this writing, we leave inotify
+unimplemented in kernfs and anonfs; it does not seem particularly useful.
+
+### Gofer Filesystem (fsimpl/gofer)
+
+The gofer filesystem has several traits that make it difficult to support
+inotify:
+
+* **There are no inodes.** A file is represented as a dentry that holds an
+ unopened p9 file (and possibly an open FID), through which the Sentry
+ interacts with the gofer.
+ * *Solution:* Because there is no inode structure stored in the sandbox,
+ inotify watches must be held on the dentry. For the purposes of inotify,
+ we assume that every dentry corresponds to a unique inode, which may
+ cause unexpected behavior in the presence of hard links, where multiple
+ dentries should share the same set of watches. Indeed, it is impossible
+ for us to be absolutely sure whether dentries correspond to the same
+ file or not, due to the following point:
+* **The Sentry cannot always be aware of hard links on the remote
+ filesystem.** There is no way for us to confirm whether two files on the
+ remote filesystem are actually links to the same inode. QIDs and inodes are
+ not always 1:1. The assumption that dentries and inodes are 1:1 is
+ inevitably broken if there are remote hard links that we cannot detect.
+ * *Solution:* this is an issue with gofer fs in general, not only inotify,
+ and we will have to live with it.
+* **Dentries can be cached, and then evicted.** Dentry lifetime does not
+ correspond to file lifetime. Because gofer fs is not entirely in-memory, the
+ absence of a dentry does not mean that the corresponding file does not
+ exist, nor does a dentry reaching zero references mean that the
+ corresponding file no longer exists. When a dentry reaches zero references,
+ it will be cached, in case the file at that path is needed again in the
+ future. However, the dentry may be evicted from the cache, which will cause
+ a new dentry to be created next time the same file path is used. The
+ existing watches will be lost.
+ * *Solution:* When a dentry reaches zero references, do not cache it if it
+ has any watches, so we can avoid eviction/destruction. Note that if the
+ dentry was deleted or invalidated (d.vfsd.IsDead()), we should still
+ destroy it along with its watches. Additionally, when a dentry’s last
+ watch is removed, we cache it if it also has zero references. This way,
+ the dentry can eventually be evicted from memory if it is no longer
+ needed.
+* **Dentries can be invalidated.** Another issue with dentry lifetime is that
+ the remote file at the file path represented may change from underneath the
+ dentry. In this case, the next time that the dentry is used, it will be
+ invalidated and a new dentry will replace it. In this case, it is not clear
+ what should be done with the watches on the old dentry.
+ * *Solution:* Silently destroy the watches when invalidation occurs. We
+ have no way of knowing exactly what happened or when it happened. Inotify
+ instances on NFS files in Linux probably behave in a similar fashion,
+ since inotify is implemented at the vfs layer and is not aware of the
+ complexities of remote file systems.
+ * An alternative would be to issue some kind of event upon invalidation,
+ e.g. a delete event, but this has several issues:
+ * We cannot discern whether the remote file was invalidated because it was
+ moved, deleted, etc. This information is crucial, because these cases
+ should result in different events. Furthermore, the watches should only
+ be destroyed if the file has been deleted.
+ * Moreover, the mechanism for detecting whether the underlying file has
+ changed is to check whether a new QID is given by the gofer. This may
+ result in false positives, e.g. suppose that the server closed and
+ re-opened the same file, which may result in a new QID.
+ * Finally, the time of the event may be completely different from the time
+ of the file modification, since a dentry is not immediately notified
+ when the underlying file has changed. It would be quite unexpected to
+ receive the notification when invalidation was triggered, i.e. the next
+ time the file was accessed within the sandbox, because then the
+ read/write/etc. operation on the file would not result in the expected
+ event.
+ * Another point in favor of the first solution: inotify in Linux can
+ already be lossy on local filesystems (one of the sacrifices made so
+ that filesystem performance isn’t killed), and it is lossy on NFS for
+ similar reasons to gofer fs. Therefore, it is better for inotify to be
+ silent than to emit incorrect notifications.
+* **There may be external users of the remote filesystem.** We can only track
+ operations performed on the file within the sandbox. This is sufficient
+ under InteropModeExclusive, but whenever there are external users, the set
+ of actions we are aware of is incomplete.
+ * *Solution:* We could either return an error or just issue a warning when
+ inotify is used without InteropModeExclusive. Although faulty, VFS1
+ allows it when the filesystem is shared, and Linux does the same for
+ remote filesystems (as mentioned above, inotify sits at the vfs level).
+
+## Dentry Interface
+
+For events that must be generated above the vfs layer, we provide the following
+DentryImpl methods to allow interactions with targets on any FilesystemImpl
+(a sketch of these hooks follows the list):
+
+* **InotifyWithParent()** generates events on the dentry’s watches as well as
+ its parent’s.
+* **Watches()** retrieves the watch set of the target represented by the
+ dentry. This is used to access and modify watches on a target.
+* **OnZeroWatches()** performs cleanup tasks after the last watch is removed
+ from a dentry. This is needed by gofer fs, which must allow a watched dentry
+ to be cached once it has no more watches. Most implementations can just do
+ nothing. Note that OnZeroWatches() must be called after all inotify locks
+ are released to preserve lock ordering, since it may acquire
+ FilesystemImpl-specific locks.
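+
+A hedged sketch of these hooks as a Go interface (signatures paraphrased from
+the descriptions above and their call sites in this change; the actual
+DentryImpl definition may differ):
+
+```go
+type inotifyDentryHooks interface {
+    // InotifyWithParent generates events on the dentry's watches as well as
+    // its parent's.
+    InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType)
+    // Watches returns the watch set of the target represented by the dentry,
+    // or nil if the target is not watchable (e.g. kernfs).
+    Watches() *Watches
+    // OnZeroWatches is called after the last watch is removed, outside all
+    // inotify locks.
+    OnZeroWatches(ctx context.Context)
+}
+```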
+
+## IN_EXCL_UNLINK
+
+There are several options that can be set for a watch, specified as part of the
+mask in inotify_add_watch(2). In particular, IN_EXCL_UNLINK requires some
+additional support in each filesystem.
+
+A watch with IN_EXCL_UNLINK will not generate events for its target if it
+corresponds to a path that was unlinked. For instance, if an fd is opened on
+“foo/bar” and “foo/bar” is subsequently unlinked, any reads/writes/etc. on the
+fd will be ignored by watches on “foo” or “foo/bar” with IN_EXCL_UNLINK. This
+requires each DentryImpl to keep track of whether it has been unlinked, in order
+to determine whether events should be sent to watches with IN_EXCL_UNLINK.
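+
+A minimal sketch of the resulting check at notification time (paraphrasing the
+logic in Watches.Notify, not quoting it):
+
+```go
+// shouldSkip reports whether a watch with IN_EXCL_UNLINK should ignore an
+// event that originated from an unlinked path.
+func shouldSkip(unlinked bool, w *Watch, et EventType) bool {
+    return unlinked && w.ExcludeUnlinked() && et == PathEvent
+}
+```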
+
+## IN_ONESHOT
+
+One-shot watches expire after generating a single event. When an event occurs,
+all one-shot watches on the target that successfully generated an event are
+removed. Lock ordering can cause the management of one-shot watches to be quite
+expensive; see Watches.Notify() for more information.
diff --git a/pkg/sentry/vfs/genericfstree/genericfstree.go b/pkg/sentry/vfs/genericfstree/genericfstree.go
index 286510195..ba6e6ed49 100644
--- a/pkg/sentry/vfs/genericfstree/genericfstree.go
+++ b/pkg/sentry/vfs/genericfstree/genericfstree.go
@@ -27,6 +27,8 @@ import (
)
// Dentry is a required type parameter that is a struct with the given fields.
+//
+// +stateify savable
type Dentry struct {
// vfsd is the embedded vfs.Dentry corresponding to this vfs.DentryImpl.
vfsd vfs.Dentry
@@ -43,7 +45,7 @@ type Dentry struct {
// IsAncestorDentry returns true if d is an ancestor of d2; that is, d is
// either d2's parent or an ancestor of d2's parent.
func IsAncestorDentry(d, d2 *Dentry) bool {
- for {
+ for d2 != nil { // Stop at root, where d2.parent == nil.
if d2.parent == d {
return true
}
@@ -52,6 +54,7 @@ func IsAncestorDentry(d, d2 *Dentry) bool {
}
d2 = d2.parent
}
+ return false
}
// ParentOrSelf returns d.parent. If d.parent is nil, ParentOrSelf returns d.
@@ -68,7 +71,7 @@ func PrependPath(vfsroot vfs.VirtualDentry, mnt *vfs.Mount, d *Dentry, b *fspath
if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() {
return vfs.PrependPathAtVFSRootError{}
}
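+ // mnt may be nil (e.g. when called via DebugPathname below), in which
+ // case there is no mount root to stop at.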
- if &d.vfsd == mnt.Root() {
+ if mnt != nil && &d.vfsd == mnt.Root() {
return nil
}
if d.parent == nil {
@@ -78,3 +81,12 @@ func PrependPath(vfsroot vfs.VirtualDentry, mnt *vfs.Mount, d *Dentry, b *fspath
d = d.parent
}
}
+
+// DebugPathname returns a pathname to d relative to its filesystem root.
+// DebugPathname does not correspond to any Linux function; it's used to
+// generate dentry pathnames for debugging.
+func DebugPathname(d *Dentry) string {
+ var b fspath.Builder
+ _ = PrependPath(vfs.VirtualDentry{}, nil, d, &b)
+ return b.String()
+}
diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go
index 05a3051a4..3f0b8f45b 100644
--- a/pkg/sentry/vfs/inotify.go
+++ b/pkg/sentry/vfs/inotify.go
@@ -37,6 +37,8 @@ const inotifyEventBaseSize = 16
//
// The way events are labelled appears somewhat arbitrary, but they must match
// Linux so that IN_EXCL_UNLINK behaves as it does in Linux.
+//
+// +stateify savable
type EventType uint8
// PathEvent and InodeEvent correspond to FSNOTIFY_EVENT_PATH and
@@ -49,14 +51,12 @@ const (
// Inotify represents an inotify instance created by inotify_init(2) or
// inotify_init1(2). Inotify implements FileDescriptionImpl.
//
-// Lock ordering:
-// Inotify.mu -> Watches.mu -> Inotify.evMu
-//
// +stateify savable
type Inotify struct {
vfsfd FileDescription
FileDescriptionDefaultImpl
DentryMetadataFileDescriptionImpl
+ NoLockFD
// Unique identifier for this inotify instance. We don't just reuse the
// inotify fd because fds can be duped. These should not be exposed to the
@@ -102,7 +102,7 @@ func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32)
id := uniqueid.GlobalFromContext(ctx)
vd := vfsObj.NewAnonVirtualDentry(fmt.Sprintf("[inotifyfd:%d]", id))
- defer vd.DecRef()
+ defer vd.DecRef(ctx)
fd := &Inotify{
id: id,
scratch: make([]byte, inotifyEventBaseSize),
@@ -120,19 +120,39 @@ func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32)
// Release implements FileDescriptionImpl.Release. Release removes all
// watches and frees all resources for an inotify instance.
-func (i *Inotify) Release() {
+func (i *Inotify) Release(ctx context.Context) {
+ var ds []*Dentry
+
// We need to hold i.mu to avoid a race with concurrent calls to
// Inotify.handleDeletion from Watches. There's no risk of Watches
// accessing this Inotify after the destructor ends, because we remove all
// references to it below.
i.mu.Lock()
- defer i.mu.Unlock()
for _, w := range i.watches {
// Remove references to the watch from the watches set on the target. We
// don't need to worry about the references from i.watches, since this
// file description is about to be destroyed.
- w.set.Remove(i.id)
+ d := w.target
+ ws := d.Watches()
+ // Watchable dentries should never return a nil watch set.
+ if ws == nil {
+ panic("Cannot remove watch from an unwatchable dentry")
+ }
+ ws.Remove(i.id)
+ if ws.Size() == 0 {
+ ds = append(ds, d)
+ }
}
+ i.mu.Unlock()
+
+ for _, d := range ds {
+ d.OnZeroWatches(ctx)
+ }
+}
+
+// Allocate implements FileDescriptionImpl.Allocate.
+func (i *Inotify) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ panic("Allocate should not be called on read-only inotify fds")
}
// EventRegister implements waiter.Waitable.
@@ -161,12 +181,12 @@ func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask {
return mask & ready
}
-// PRead implements FileDescriptionImpl.
+// PRead implements FileDescriptionImpl.PRead.
func (*Inotify) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
return 0, syserror.ESPIPE
}
-// PWrite implements FileDescriptionImpl.
+// PWrite implements FileDescriptionImpl.PWrite.
func (*Inotify) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
return 0, syserror.ESPIPE
}
@@ -225,7 +245,7 @@ func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOpt
return writeLen, nil
}
-// Ioctl implements fs.FileOperations.Ioctl.
+// Ioctl implements FileDescriptionImpl.Ioctl.
func (i *Inotify) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
switch args[1].Int() {
case linux.FIONREAD:
@@ -271,20 +291,19 @@ func (i *Inotify) queueEvent(ev *Event) {
// newWatchLocked creates and adds a new watch to target.
//
-// Precondition: i.mu must be locked.
-func (i *Inotify) newWatchLocked(target *Dentry, mask uint32) *Watch {
- targetWatches := target.Watches()
+// Preconditions:
+// * i.mu must be locked.
+// * ws must be the watch set for target d.
+func (i *Inotify) newWatchLocked(d *Dentry, ws *Watches, mask uint32) *Watch {
w := &Watch{
- owner: i,
- wd: i.nextWatchIDLocked(),
- set: targetWatches,
- mask: mask,
+ owner: i,
+ wd: i.nextWatchIDLocked(),
+ target: d,
+ mask: mask,
}
// Hold the watch in this inotify instance as well as the watch set on the
// target.
i.watches[w.wd] = w
- targetWatches.Add(w)
+ ws.Add(w)
return w
}
@@ -296,22 +315,11 @@ func (i *Inotify) nextWatchIDLocked() int32 {
return i.nextWatchMinusOne
}
-// handleDeletion handles the deletion of the target of watch w. It removes w
-// from i.watches and a watch removal event is generated.
-func (i *Inotify) handleDeletion(w *Watch) {
- i.mu.Lock()
- _, found := i.watches[w.wd]
- delete(i.watches, w.wd)
- i.mu.Unlock()
-
- if found {
- i.queueEvent(newEvent(w.wd, "", linux.IN_IGNORED, 0))
- }
-}
-
// AddWatch constructs a new inotify watch and adds it to the target. It
// returns the watch descriptor returned by inotify_add_watch(2).
-func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 {
+//
+// The caller must hold a reference on target.
+func (i *Inotify) AddWatch(target *Dentry, mask uint32) (int32, error) {
// Note: Locking this inotify instance protects the result returned by
// Lookup() below. With the lock held, we know for sure the lookup result
// won't become stale because it's impossible for *this* instance to
@@ -319,8 +327,14 @@ func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 {
i.mu.Lock()
defer i.mu.Unlock()
+ ws := target.Watches()
+ if ws == nil {
+ // While Linux supports inotify watches on all filesystem types, watches on
+ // filesystems like kernfs are not generally useful, so we do not.
+ return 0, syserror.EPERM
+ }
// Does the target already have a watch from this inotify instance?
- if existing := target.Watches().Lookup(i.id); existing != nil {
+ if existing := ws.Lookup(i.id); existing != nil {
newmask := mask
if mask&linux.IN_MASK_ADD != 0 {
// "Add (OR) events to watch mask for this pathname if it already
@@ -328,17 +342,17 @@ func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 {
newmask |= atomic.LoadUint32(&existing.mask)
}
atomic.StoreUint32(&existing.mask, newmask)
- return existing.wd
+ return existing.wd, nil
}
// No existing watch, create a new watch.
- w := i.newWatchLocked(target, mask)
- return w.wd
+ w := i.newWatchLocked(target, ws, mask)
+ return w.wd, nil
}
// RmWatch looks up an inotify watch for the given 'wd' and configures the
// target to stop sending events to this inotify instance.
-func (i *Inotify) RmWatch(wd int32) error {
+func (i *Inotify) RmWatch(ctx context.Context, wd int32) error {
i.mu.Lock()
// Find the watch we were asked to remove.
@@ -352,9 +366,19 @@ func (i *Inotify) RmWatch(wd int32) error {
delete(i.watches, wd)
// Remove the watch from the watch target.
- w.set.Remove(w.OwnerID())
+ ws := w.target.Watches()
+ // AddWatch ensures that w.target has a non-nil watch set.
+ if ws == nil {
+ panic("Watched dentry cannot have nil watch set")
+ }
+ ws.Remove(w.OwnerID())
+ remaining := ws.Size()
i.mu.Unlock()
+ if remaining == 0 {
+ w.target.OnZeroWatches(ctx)
+ }
+
// Generate the event for the removal.
i.queueEvent(newEvent(wd, "", linux.IN_IGNORED, 0))
@@ -373,6 +397,13 @@ type Watches struct {
ws map[uint64]*Watch
}
+// Size returns the number of watches held by w.
+func (w *Watches) Size() int {
+ w.mu.Lock()
+ defer w.mu.Unlock()
+ return len(w.ws)
+}
+
// Lookup returns the watch owned by an inotify instance with the given id.
// Returns nil if no such watch exists.
//
@@ -423,64 +454,86 @@ func (w *Watches) Remove(id uint64) {
return
}
- if _, ok := w.ws[id]; !ok {
- // While there's technically no problem with silently ignoring a missing
- // watch, this is almost certainly a bug.
- panic(fmt.Sprintf("Attempt to remove a watch, but no watch found with provided id %+v.", id))
+ // It is possible for w.Remove() to be called for the same watch multiple
+ // times. See the treatment of one-shot watches in Watches.Notify().
+ if _, ok := w.ws[id]; ok {
+ delete(w.ws, id)
}
- delete(w.ws, id)
}
-// Notify queues a new event with all watches in this set.
-func (w *Watches) Notify(name string, events, cookie uint32, et EventType) {
- w.NotifyWithExclusions(name, events, cookie, et, false)
+// Notify queues a new event with watches in this set. Watches with
+// IN_EXCL_UNLINK are skipped if the event is coming from a child that has been
+// unlinked.
+func (w *Watches) Notify(ctx context.Context, name string, events, cookie uint32, et EventType, unlinked bool) {
+ var hasExpired bool
+ w.mu.RLock()
+ for _, watch := range w.ws {
+ if unlinked && watch.ExcludeUnlinked() && et == PathEvent {
+ continue
+ }
+ if watch.Notify(name, events, cookie) {
+ hasExpired = true
+ }
+ }
+ w.mu.RUnlock()
+
+ if hasExpired {
+ w.cleanupExpiredWatches(ctx)
+ }
}
-// NotifyWithExclusions queues a new event with watches in this set. Watches
-// with IN_EXCL_UNLINK are skipped if the event is coming from a child that
-// has been unlinked.
-func (w *Watches) NotifyWithExclusions(name string, events, cookie uint32, et EventType, unlinked bool) {
- // N.B. We don't defer the unlocks because Notify is in the hot path of
- // all IO operations, and the defer costs too much for small IO
- // operations.
+// This function is relatively expensive and should only be called when there
+// are expired watches.
+func (w *Watches) cleanupExpiredWatches(ctx context.Context) {
+ // Because of lock ordering, we cannot acquire Inotify.mu for each watch
+ // owner while holding w.mu. As a result, store expired watches locally
+ // before removing.
+ var toRemove []*Watch
w.mu.RLock()
for _, watch := range w.ws {
- if unlinked && watch.ExcludeUnlinkedChildren() && et == PathEvent {
- continue
+ if atomic.LoadInt32(&watch.expired) == 1 {
+ toRemove = append(toRemove, watch)
}
- watch.Notify(name, events, cookie)
}
w.mu.RUnlock()
+ for _, watch := range toRemove {
+ watch.owner.RmWatch(ctx, watch.wd)
+ }
}
-// HandleDeletion is called when the watch target is destroyed to emit
-// the appropriate events.
-func (w *Watches) HandleDeletion() {
- w.Notify("", linux.IN_DELETE_SELF, 0, InodeEvent)
+// HandleDeletion is called when the watch target is destroyed. Clear the
+// watch set, detach watches from the inotify instances they belong to, and
+// generate the appropriate events.
+func (w *Watches) HandleDeletion(ctx context.Context) {
+ w.Notify(ctx, "", linux.IN_DELETE_SELF, 0, InodeEvent, true /* unlinked */)
- // TODO(gvisor.dev/issue/1479): This doesn't work because maps are not copied
- // by value. Ideally, we wouldn't have this circular locking so we can just
- // notify of IN_DELETE_SELF in the same loop below.
- //
- // We can't hold w.mu while calling watch.handleDeletion to preserve lock
- // ordering w.r.t to the owner inotify instances. Instead, atomically move
- // the watches map into a local variable so we can iterate over it safely.
- //
- // Because of this however, it is possible for the watches' owners to reach
- // this inode while the inode has no refs. This is still safe because the
- // owners can only reach the inode until this function finishes calling
- // watch.handleDeletion below and the inode is guaranteed to exist in the
- // meantime. But we still have to be very careful not to rely on inode state
- // that may have been already destroyed.
+ // As in Watches.Notify, we can't hold w.mu while acquiring Inotify.mu for
+ // the owner of each watch being deleted. Instead, atomically store the
+ // watches map in a local variable and set it to nil so we can iterate over
+ // it with the assurance that there will be no concurrent accesses.
var ws map[uint64]*Watch
w.mu.Lock()
ws = w.ws
w.ws = nil
w.mu.Unlock()
+ // Remove each watch from its owner's watch set, and generate a corresponding
+ // watch removal event.
for _, watch := range ws {
- // TODO(gvisor.dev/issue/1479): consider refactoring this.
- watch.handleDeletion()
+ i := watch.owner
+ i.mu.Lock()
+ _, found := i.watches[watch.wd]
+ delete(i.watches, watch.wd)
+
+ // Release mutex before notifying waiters because we don't control what
+ // they can do.
+ i.mu.Unlock()
+
+ // If watch was not found, it was removed from the inotify instance before
+ // we could get to it, in which case we should not generate an event.
+ if found {
+ i.queueEvent(newEvent(watch.wd, "", linux.IN_IGNORED, 0))
+ }
}
}
@@ -489,18 +542,28 @@ func (w *Watches) HandleDeletion() {
// +stateify savable
type Watch struct {
// Inotify instance which owns this watch.
+ //
+ // This field is immutable after creation.
owner *Inotify
// Descriptor for this watch. This is unique across an inotify instance.
+ //
+ // This field is immutable after creation.
wd int32
- // set is the watch set containing this watch. It belongs to the target file
- // of this watch.
- set *Watches
+ // target is a dentry representing the watch target. Its watch set contains this watch.
+ //
+ // This field is immutable after creation.
+ target *Dentry
// Events being monitored via this watch. Must be accessed with atomic
// memory operations.
mask uint32
+
+ // expired is set to 1 to indicate that this watch is a one-shot that has
+ // already sent a notification and therefore can be removed. Must be accessed
+ // with atomic memory operations.
+ expired int32
}
// OwnerID returns the id of the inotify instance that owns this watch.
@@ -508,23 +571,29 @@ func (w *Watch) OwnerID() uint64 {
return w.owner.id
}
-// ExcludeUnlinkedChildren indicates whether the watched object should continue
-// to be notified of events of its children after they have been unlinked, e.g.
-// for an open file descriptor.
+// ExcludeUnlinked indicates whether the watched object should continue to be
+// notified of events originating from a path that has been unlinked.
//
-// TODO(gvisor.dev/issue/1479): Implement IN_EXCL_UNLINK.
-// We can do this by keeping track of the set of unlinked children in Watches
-// to skip notification.
-func (w *Watch) ExcludeUnlinkedChildren() bool {
+// For example, if "foo/bar" is opened and then unlinked, operations on the
+// open fd may be ignored by watches on "foo" and "foo/bar" with IN_EXCL_UNLINK.
+func (w *Watch) ExcludeUnlinked() bool {
return atomic.LoadUint32(&w.mask)&linux.IN_EXCL_UNLINK != 0
}
-// Notify queues a new event on this watch.
-func (w *Watch) Notify(name string, events uint32, cookie uint32) {
+// Notify queues a new event on this watch. Returns true if this is a one-shot
+// watch that should be deleted, after this event was successfully queued.
+func (w *Watch) Notify(name string, events uint32, cookie uint32) bool {
+ if atomic.LoadInt32(&w.expired) == 1 {
+ // This is a one-shot watch that is already in the process of being
+ // removed. This may happen if a second event reaches the watch target
+ // before this watch has been removed.
+ return false
+ }
+
mask := atomic.LoadUint32(&w.mask)
if mask&events == 0 {
// We weren't watching for this event.
- return
+ return false
}
// Event mask should include bits matched from the watch plus all control
@@ -533,11 +602,11 @@ func (w *Watch) Notify(name string, events uint32, cookie uint32) {
effectiveMask := unmaskableBits | mask
matchedEvents := effectiveMask & events
w.owner.queueEvent(newEvent(w.wd, name, matchedEvents, cookie))
-}
-
-// handleDeletion handles the deletion of w's target.
-func (w *Watch) handleDeletion() {
- w.owner.handleDeletion(w)
+ if mask&linux.IN_ONESHOT != 0 {
+ atomic.StoreInt32(&w.expired, 1)
+ return true
+ }
+ return false
}
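
The expired flag is what makes one-shot delivery race-safe without holding a lock across queueing: a second event that reaches the watch target before the watch is removed sees expired == 1 and backs off. A self-contained sketch of the same state machine (constants and names invented for illustration):

package main

import (
	"fmt"
	"sync/atomic"
)

type watch struct {
	mask    uint32 // bits being watched; read atomically
	expired int32  // 1 once a one-shot watch has fired
}

const oneShot = 0x80000000 // stand-in for IN_ONESHOT

// notify reports whether the caller should remove the watch afterwards.
func (w *watch) notify(events uint32) bool {
	if atomic.LoadInt32(&w.expired) == 1 {
		return false // already fired; a second event raced in
	}
	mask := atomic.LoadUint32(&w.mask)
	if mask&events == 0 {
		return false // not watching for this event
	}
	fmt.Printf("event %#x queued\n", mask&events)
	if mask&oneShot != 0 {
		atomic.StoreInt32(&w.expired, 1)
		return true
	}
	return false
}

func main() {
	w := &watch{mask: 0x1 | oneShot}
	fmt.Println(w.notify(0x1)) // true: fire once, then expire
	fmt.Println(w.notify(0x1)) // false: already expired
}
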
// Event represents a struct inotify_event from linux.
@@ -605,7 +674,7 @@ func (e *Event) setName(name string) {
func (e *Event) sizeOf() int {
s := inotifyEventBaseSize + int(e.len)
if s < inotifyEventBaseSize {
- panic("overflow")
+ panic("Overflowed event size")
}
return s
}
@@ -675,11 +744,15 @@ func InotifyEventFromStatMask(mask uint32) uint32 {
}
// InotifyRemoveChild sends the appropriate notifications to the watch sets of
-// the child being removed and its parent.
-func InotifyRemoveChild(self, parent *Watches, name string) {
- self.Notify("", linux.IN_ATTRIB, 0, InodeEvent)
- parent.Notify(name, linux.IN_DELETE, 0, InodeEvent)
- // TODO(gvisor.dev/issue/1479): implement IN_EXCL_UNLINK.
+// the child being removed and its parent. Note that unlike most pairs of
+// parent/child notifications, the child is notified first in this case.
+func InotifyRemoveChild(ctx context.Context, self, parent *Watches, name string) {
+ if self != nil {
+ self.Notify(ctx, "", linux.IN_ATTRIB, 0, InodeEvent, true /* unlinked */)
+ }
+ if parent != nil {
+ parent.Notify(ctx, name, linux.IN_DELETE, 0, InodeEvent, true /* unlinked */)
+ }
}
// InotifyRename sends the appropriate notifications to the watch sets of the
@@ -690,8 +763,14 @@ func InotifyRename(ctx context.Context, renamed, oldParent, newParent *Watches,
dirEv = linux.IN_ISDIR
}
cookie := uniqueid.InotifyCookie(ctx)
- oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent)
- newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent)
+ if oldParent != nil {
+ oldParent.Notify(ctx, oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent, false /* unlinked */)
+ }
+ if newParent != nil {
+ newParent.Notify(ctx, newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent, false /* unlinked */)
+ }
// Somewhat surprisingly, self move events do not have a cookie.
- renamed.Notify("", linux.IN_MOVE_SELF, 0, InodeEvent)
+ if renamed != nil {
+ renamed.Notify(ctx, "", linux.IN_MOVE_SELF, 0, InodeEvent, false /* unlinked */)
+ }
}
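
The shared cookie is what lets a reader of the inotify fd pair the IN_MOVED_FROM and IN_MOVED_TO halves of one rename; as the comment notes, the IN_MOVE_SELF event carries none. A toy sketch of the event triple (types and constants are illustrative, not the sentry's):

package main

import "fmt"

type event struct {
	wd     int32
	mask   uint32
	cookie uint32
	name   string
}

const (
	inMovedFrom = 0x40
	inMovedTo   = 0x80
	inMoveSelf  = 0x800
)

// rename emits the three events generated for rename(2): the two parent
// events share a cookie so readers can pair them; the self event does not.
func rename(oldParent, newParent, renamed int32, oldName, newName string, cookie uint32) []event {
	return []event{
		{oldParent, inMovedFrom, cookie, oldName},
		{newParent, inMovedTo, cookie, newName},
		{renamed, inMoveSelf, 0, ""},
	}
}

func main() {
	for _, e := range rename(1, 2, 3, "a", "b", 42) {
		fmt.Printf("%+v\n", e)
	}
}
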
diff --git a/pkg/sentry/vfs/lock/lock.go b/pkg/sentry/vfs/lock.go
index 724dfe743..1ff202f2a 100644
--- a/pkg/sentry/vfs/lock/lock.go
+++ b/pkg/sentry/vfs/lock.go
@@ -12,14 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// Package lock provides POSIX and BSD style file locking for VFS2 file
-// implementations.
-//
-// The actual implementations can be found in the lock package under
-// sentry/fs/lock.
-package lock
+package vfs
import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -31,6 +28,8 @@ import (
// Note that in Linux these two types of locks are _not_ cooperative, because
// race and deadlock conditions make merging them prohibitive. We do the same
// and keep them oblivious to each other.
+//
+// +stateify savable
type FileLocks struct {
// bsd is a set of BSD-style advisory file wide locks, see flock(2).
bsd fslock.Locks
@@ -44,7 +43,13 @@ func (fl *FileLocks) LockBSD(uid fslock.UniqueID, t fslock.LockType, block fsloc
if fl.bsd.LockRegion(uid, t, fslock.LockRange{0, fslock.LockEOF}, block) {
return nil
}
- return syserror.ErrWouldBlock
+
+ // Return an appropriate error for the unsuccessful lock attempt, depending on
+ // whether this is a blocking or non-blocking operation.
+ if block == nil {
+ return syserror.ErrWouldBlock
+ }
+ return syserror.ERESTARTSYS
}
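
The error selection above encodes a subtle contract: a nil blocker marks a non-blocking attempt (LOCK_NB, F_SETLK), which fails with EWOULDBLOCK, while a non-nil blocker means the wait was interrupted, so ERESTARTSYS lets the syscall be transparently restarted. A minimal sketch, assuming a hypothetical blocker interface (fslock.Blocker's real signature may differ):

package main

import (
	"errors"
	"fmt"
)

var (
	errWouldBlock = errors.New("EWOULDBLOCK")
	errRestartSys = errors.New("ERESTARTSYS")
)

// blocker is a stand-in for the fslock blocker: nil means the caller asked
// for a non-blocking attempt; non-nil means the attempt blocked and was
// interrupted before the lock could be acquired.
type blocker interface{ Block() error }

// lockErr picks the failure error the same way LockBSD/LockPOSIX do above.
func lockErr(block blocker) error {
	if block == nil {
		return errWouldBlock
	}
	return errRestartSys
}

func main() {
	fmt.Println(lockErr(nil)) // EWOULDBLOCK
}
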
// UnlockBSD releases a BSD-style lock on the entire file.
@@ -56,17 +61,58 @@ func (fl *FileLocks) UnlockBSD(uid fslock.UniqueID) {
}
// LockPOSIX tries to acquire a POSIX-style lock on a file region.
-func (fl *FileLocks) LockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error {
+func (fl *FileLocks) LockPOSIX(ctx context.Context, fd *FileDescription, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ rng, err := computeRange(ctx, fd, start, length, whence)
+ if err != nil {
+ return err
+ }
if fl.posix.LockRegion(uid, t, rng, block) {
return nil
}
- return syserror.ErrWouldBlock
+
+ // Return an appropriate error for the unsuccessful lock attempt, depending on
+ // whether this is a blocking or non-blocking operation.
+ if block == nil {
+ return syserror.ErrWouldBlock
+ }
+ return syserror.ERESTARTSYS
}
// UnlockPOSIX releases a POSIX-style lock on a file region.
//
// This operation is always successful, even if there did not exist a lock on
// the requested region held by uid in the first place.
-func (fl *FileLocks) UnlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) {
+func (fl *FileLocks) UnlockPOSIX(ctx context.Context, fd *FileDescription, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ rng, err := computeRange(ctx, fd, start, length, whence)
+ if err != nil {
+ return err
+ }
fl.posix.UnlockRegion(uid, rng)
+ return nil
+}
+
+func computeRange(ctx context.Context, fd *FileDescription, start uint64, length uint64, whence int16) (fslock.LockRange, error) {
+ var off int64
+ switch whence {
+ case linux.SEEK_SET:
+ off = 0
+ case linux.SEEK_CUR:
+ // Note that Linux does not hold any mutexes while retrieving the file
+ // offset, see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk.
+ curOff, err := fd.Seek(ctx, 0, linux.SEEK_CUR)
+ if err != nil {
+ return fslock.LockRange{}, err
+ }
+ off = curOff
+ case linux.SEEK_END:
+ stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_SIZE})
+ if err != nil {
+ return fslock.LockRange{}, err
+ }
+ off = int64(stat.Size)
+ default:
+ return fslock.LockRange{}, syserror.EINVAL
+ }
+
+ return fslock.ComputeRange(int64(start), int64(length), off)
}
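
computeRange resolves an fcntl-style (start, length, whence) triple against a base offset before handing it to fslock.ComputeRange. A self-contained approximation of that resolution, with simplified bounds handling that is not the fslock implementation:

package main

import (
	"errors"
	"fmt"
	"math"
)

const (
	seekSet = 0 // stand-ins for linux.SEEK_SET etc.
	seekCur = 1
	seekEnd = 2
)

// lockRange resolves (start, length, whence) into an absolute byte range,
// given the current file offset and size. length == 0 conventionally means
// "lock to EOF".
func lockRange(start, length int64, whence int16, curOff, size int64) (lo, hi int64, err error) {
	var base int64
	switch whence {
	case seekSet:
		base = 0
	case seekCur:
		base = curOff
	case seekEnd:
		base = size
	default:
		return 0, 0, errors.New("invalid whence")
	}
	lo = base + start
	if lo < 0 {
		return 0, 0, errors.New("range starts before offset 0")
	}
	if length == 0 {
		return lo, math.MaxInt64, nil // lock to EOF
	}
	return lo, lo + length, nil
}

func main() {
	lo, hi, _ := lockRange(10, 100, seekCur, 4096, 1<<20)
	fmt.Println(lo, hi) // 4106 4206
}
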
diff --git a/pkg/sentry/vfs/lock/BUILD b/pkg/sentry/vfs/lock/BUILD
deleted file mode 100644
index d9ab063b7..000000000
--- a/pkg/sentry/vfs/lock/BUILD
+++ /dev/null
@@ -1,13 +0,0 @@
-load("//tools:defs.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
- name = "lock",
- srcs = ["lock.go"],
- visibility = ["//pkg/sentry:internal"],
- deps = [
- "//pkg/sentry/fs/lock",
- "//pkg/syserror",
- ],
-)
diff --git a/pkg/sentry/vfs/memxattr/xattr.go b/pkg/sentry/vfs/memxattr/xattr.go
index cc1e7d764..638b5d830 100644
--- a/pkg/sentry/vfs/memxattr/xattr.go
+++ b/pkg/sentry/vfs/memxattr/xattr.go
@@ -33,8 +33,8 @@ type SimpleExtendedAttributes struct {
xattrs map[string]string
}
-// Getxattr returns the value at 'name'.
-func (x *SimpleExtendedAttributes) Getxattr(opts *vfs.GetxattrOptions) (string, error) {
+// GetXattr returns the value at 'name'.
+func (x *SimpleExtendedAttributes) GetXattr(opts *vfs.GetXattrOptions) (string, error) {
x.mu.RLock()
value, ok := x.xattrs[opts.Name]
x.mu.RUnlock()
@@ -49,8 +49,8 @@ func (x *SimpleExtendedAttributes) Getxattr(opts *vfs.GetxattrOptions) (string,
return value, nil
}
-// Setxattr sets 'value' at 'name'.
-func (x *SimpleExtendedAttributes) Setxattr(opts *vfs.SetxattrOptions) error {
+// SetXattr sets 'value' at 'name'.
+func (x *SimpleExtendedAttributes) SetXattr(opts *vfs.SetXattrOptions) error {
x.mu.Lock()
defer x.mu.Unlock()
if x.xattrs == nil {
@@ -72,8 +72,8 @@ func (x *SimpleExtendedAttributes) Setxattr(opts *vfs.SetxattrOptions) error {
return nil
}
-// Listxattr returns all names in xattrs.
-func (x *SimpleExtendedAttributes) Listxattr(size uint64) ([]string, error) {
+// ListXattr returns all names in xattrs.
+func (x *SimpleExtendedAttributes) ListXattr(size uint64) ([]string, error) {
// Keep track of the size of the buffer needed in listxattr(2) for the list.
listSize := 0
x.mu.RLock()
@@ -90,8 +90,8 @@ func (x *SimpleExtendedAttributes) Listxattr(size uint64) ([]string, error) {
return names, nil
}
-// Removexattr removes the xattr at 'name'.
-func (x *SimpleExtendedAttributes) Removexattr(name string) error {
+// RemoveXattr removes the xattr at 'name'.
+func (x *SimpleExtendedAttributes) RemoveXattr(name string) error {
x.mu.Lock()
defer x.mu.Unlock()
if _, ok := x.xattrs[name]; !ok {
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index e4ac6524b..d452d2cda 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -24,6 +24,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/refsvfs2"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -46,8 +47,9 @@ import (
// +stateify savable
type Mount struct {
// vfs, fs, root are immutable. References are held on fs and root.
+ // Note that for a disconnected mount, root may be nil.
//
- // Invariant: root belongs to fs.
+ // Invariant: if not nil, root belongs to fs.
vfs *VirtualFilesystem
fs *Filesystem
root *Dentry
@@ -55,13 +57,17 @@ type Mount struct {
// ID is the immutable mount ID.
ID uint64
+ // Flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except
+ // for MS_RDONLY which is tracked in "writers". Immutable.
+ Flags MountFlags
+
// key is protected by VirtualFilesystem.mountMu and
// VirtualFilesystem.mounts.seq, and may be nil. References are held on
// key.parent and key.point if they are not nil.
//
// Invariant: key.parent != nil iff key.point != nil. key.point belongs to
// key.parent.fs.
- key mountKey
+ key mountKey `state:".(VirtualDentry)"`
// ns is the namespace in which this Mount was mounted. ns is protected by
// VirtualFilesystem.mountMu.
@@ -81,10 +87,6 @@ type Mount struct {
// umounted is true. umounted is protected by VirtualFilesystem.mountMu.
umounted bool
- // flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except
- // for MS_RDONLY which is tracked in "writers".
- flags MountFlags
-
// The lower 63 bits of writers is the number of calls to
// Mount.CheckBeginWrite() that have not yet been paired with a call to
// Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect.
@@ -95,16 +97,19 @@ type Mount struct {
func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount {
mnt := &Mount{
ID: atomic.AddUint64(&vfs.lastMountID, 1),
+ Flags: opts.Flags,
vfs: vfs,
fs: fs,
root: root,
- flags: opts.Flags,
ns: mntns,
refs: 1,
}
if opts.ReadOnly {
mnt.setReadOnlyLocked(true)
}
+ if refsvfs2.LeakCheckEnabled() {
+ refsvfs2.Register(mnt, "vfs.Mount")
+ }
return mnt
}
@@ -113,13 +118,12 @@ func (mnt *Mount) Options() MountOptions {
mnt.vfs.mountMu.Lock()
defer mnt.vfs.mountMu.Unlock()
return MountOptions{
- Flags: mnt.flags,
- ReadOnly: mnt.readOnly(),
+ Flags: mnt.Flags,
+ ReadOnly: mnt.ReadOnly(),
}
}
// A MountNamespace is a collection of Mounts.
//
// MountNamespaces are reference-counted. Unless otherwise specified, all
// MountNamespace methods require that a reference is held.
//
@@ -127,13 +131,14 @@ func (mnt *Mount) Options() MountOptions {
//
// +stateify savable
type MountNamespace struct {
+ MountNamespaceRefs
+
+ // Owner is the usernamespace that owns this mount namespace.
+ Owner *auth.UserNamespace
+
// root is the MountNamespace's root mount. root is immutable.
root *Mount
- // refs is the reference count. refs is accessed using atomic memory
- // operations.
- refs int64
-
// mountpoints maps all Dentries which are mount points in this namespace
// to the number of Mounts for which they are mount points. mountpoints is
// protected by VirtualFilesystem.mountMu.
@@ -152,21 +157,22 @@ type MountNamespace struct {
// NewMountNamespace returns a new mount namespace with a root filesystem
// configured by the given arguments. A reference is taken on the returned
// MountNamespace.
-func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) {
+func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *MountOptions) (*MountNamespace, error) {
rft := vfs.getFilesystemType(fsTypeName)
if rft == nil {
ctx.Warningf("Unknown filesystem type: %s", fsTypeName)
return nil, syserror.ENODEV
}
- fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
+ fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
if err != nil {
return nil, err
}
mntns := &MountNamespace{
- refs: 1,
+ Owner: creds.UserNamespace,
mountpoints: make(map[*Dentry]uint32),
}
- mntns.root = newMount(vfs, fs, root, mntns, &MountOptions{})
+ mntns.EnableLeakCheck()
+ mntns.root = newMount(vfs, fs, root, mntns, opts)
return mntns, nil
}
@@ -197,8 +203,8 @@ func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth
if err != nil {
return nil, err
}
- defer root.DecRef()
- defer fs.DecRef()
+ defer root.DecRef(ctx)
+ defer fs.DecRef(ctx)
return vfs.NewDisconnectedMount(fs, root, opts)
}
@@ -218,7 +224,7 @@ func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Cr
if vd.dentry.dead {
vd.dentry.mu.Unlock()
vfs.mountMu.Unlock()
- vd.DecRef()
+ vd.DecRef(ctx)
return syserror.ENOENT
}
// vd might have been mounted over between vfs.GetDentryAt() and
@@ -240,7 +246,7 @@ func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Cr
// This can't fail since we're holding vfs.mountMu.
nextmnt.root.IncRef()
vd.dentry.mu.Unlock()
- vd.DecRef()
+ vd.DecRef(ctx)
vd = VirtualDentry{
mount: nextmnt,
dentry: nextmnt.root,
@@ -260,16 +266,20 @@ func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Cr
}
// MountAt creates and mounts a Filesystem configured by the given arguments.
-func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
+// The VirtualFilesystem will hold a reference to the Mount until it is unmounted.
+//
+// This method returns the mounted Mount without a reference, for convenience
+// during VFS setup when there is no chance of racing with unmount.
+func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) (*Mount, error) {
mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts)
if err != nil {
- return err
+ return nil, err
}
+ defer mnt.DecRef(ctx)
if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil {
- mnt.DecRef()
- return err
+ return nil, err
}
- return nil
+ return mnt, nil
}
// UmountAt removes the Mount at the given path.
@@ -279,6 +289,9 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti
}
// MNT_FORCE is currently unimplemented except for the permission check.
+ // Force unmounting specifically requires CAP_SYS_ADMIN in the root user
+ // namespace, and not in the owner user namespace for the target mount. See
+ // fs/namespace.c:SYSCALL_DEFINE2(umount, ...)
if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) {
return syserror.EPERM
}
@@ -287,13 +300,13 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti
if err != nil {
return err
}
- defer vd.DecRef()
+ defer vd.DecRef(ctx)
if vd.dentry != vd.mount.root {
return syserror.EINVAL
}
vfs.mountMu.Lock()
if mntns := MountNamespaceFromContext(ctx); mntns != nil {
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
if mntns != vd.mount.ns {
vfs.mountMu.Unlock()
return syserror.EINVAL
@@ -329,14 +342,15 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti
vfs.mounts.seq.EndWrite()
vfs.mountMu.Unlock()
for _, vd := range vdsToDecRef {
- vd.DecRef()
+ vd.DecRef(ctx)
}
for _, mnt := range mountsToDecRef {
- mnt.DecRef()
+ mnt.DecRef(ctx)
}
return nil
}
+// +stateify savable
type umountRecursiveOptions struct {
// If eager is true, ensure that future calls to Mount.tryIncMountedRef()
// on umounted mounts fail.
@@ -363,8 +377,9 @@ type umountRecursiveOptions struct {
//
// umountRecursiveLocked is analogous to Linux's fs/namespace.c:umount_tree().
//
-// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
-// writer critical section.
+// Preconditions:
+// * vfs.mountMu must be locked.
+// * vfs.mounts.seq must be in a writer critical section.
func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecursiveOptions, vdsToDecRef []VirtualDentry, mountsToDecRef []*Mount) ([]VirtualDentry, []*Mount) {
if !mnt.umounted {
mnt.umounted = true
@@ -393,10 +408,19 @@ func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecu
// connectLocked makes vd the mount parent/point for mnt. It consumes
// references held by vd.
//
-// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
-// writer critical section. d.mu must be locked. mnt.parent() == nil.
+// Preconditions:
+// * vfs.mountMu must be locked.
+// * vfs.mounts.seq must be in a writer critical section.
+// * d.mu must be locked.
+// * mnt.parent() == nil, i.e. mnt must not already be connected.
func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) {
- mnt.storeKey(vd)
+ if checkInvariants {
+ if mnt.parent() != nil {
+ panic("VFS.connectLocked called on connected mount")
+ }
+ }
+ mnt.IncRef() // dropped by callers of umountRecursiveLocked
+ mnt.setKey(vd)
if vd.mount.children == nil {
vd.mount.children = make(map[*Mount]struct{})
}
@@ -416,11 +440,18 @@ func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns
// disconnectLocked makes vd have no mount parent/point and returns its old
// mount parent/point with a reference held.
//
-// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
-// writer critical section. mnt.parent() != nil.
+// Preconditions:
+// * vfs.mountMu must be locked.
+// * vfs.mounts.seq must be in a writer critical section.
+// * mnt.parent() != nil.
func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry {
- vd := mnt.loadKey()
- mnt.storeKey(VirtualDentry{})
+ vd := mnt.getKey()
+ if checkInvariants {
+ if vd.mount == nil {
+ panic("VFS.disconnectLocked called on disconnected mount")
+ }
+ }
+ mnt.loadKey(VirtualDentry{})
delete(vd.mount.children, mnt)
atomic.AddUint32(&vd.dentry.mounts, math.MaxUint32) // -1
mnt.ns.mountpoints[vd.dentry]--
@@ -461,36 +492,43 @@ func (mnt *Mount) IncRef() {
}
// DecRef decrements mnt's reference count.
-func (mnt *Mount) DecRef() {
- refs := atomic.AddInt64(&mnt.refs, -1)
- if refs&^math.MinInt64 == 0 { // mask out MSB
- var vd VirtualDentry
- if mnt.parent() != nil {
- mnt.vfs.mountMu.Lock()
- mnt.vfs.mounts.seq.BeginWrite()
- vd = mnt.vfs.disconnectLocked(mnt)
- mnt.vfs.mounts.seq.EndWrite()
- mnt.vfs.mountMu.Unlock()
- }
- mnt.root.DecRef()
- mnt.fs.DecRef()
- if vd.Ok() {
- vd.DecRef()
+func (mnt *Mount) DecRef(ctx context.Context) {
+ r := atomic.AddInt64(&mnt.refs, -1)
+ if r&^math.MinInt64 == 0 { // mask out MSB
+ if refsvfs2.LeakCheckEnabled() {
+ refsvfs2.Unregister(mnt, "vfs.Mount")
}
+ mnt.destroy(ctx)
}
}
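
The r &^ math.MinInt64 == 0 test works because the refcount word reserves its sign bit for an unrelated flag (the same layout Mount.writers uses for MS_RDONLY); masking the MSB out of the atomic add's result leaves the count alone. A standalone sketch:

package main

import (
	"fmt"
	"math"
	"sync/atomic"
)

// counted packs two things into one int64: the low 63 bits count
// references, and the sign bit is a sticky flag that must not perturb the
// zero test.
type counted struct {
	refs int64
}

func (c *counted) decRef() {
	r := atomic.AddInt64(&c.refs, -1)
	if r&^math.MinInt64 == 0 { // mask out the MSB before comparing to zero
		fmt.Println("last reference dropped; destroying")
	}
}

func main() {
	c := &counted{refs: 1 | math.MinInt64} // one reference, flag set
	c.decRef()
}
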
-// IncRef increments mntns' reference count.
-func (mntns *MountNamespace) IncRef() {
- if atomic.AddInt64(&mntns.refs, 1) <= 1 {
- panic("MountNamespace.IncRef() called without holding a reference")
+func (mnt *Mount) destroy(ctx context.Context) {
+ var vd VirtualDentry
+ if mnt.parent() != nil {
+ mnt.vfs.mountMu.Lock()
+ mnt.vfs.mounts.seq.BeginWrite()
+ vd = mnt.vfs.disconnectLocked(mnt)
+ mnt.vfs.mounts.seq.EndWrite()
+ mnt.vfs.mountMu.Unlock()
}
+ if mnt.root != nil {
+ mnt.root.DecRef(ctx)
+ }
+ mnt.fs.DecRef(ctx)
+ if vd.Ok() {
+ vd.DecRef(ctx)
+ }
+}
+
+// LeakMessage implements refsvfs2.CheckedObject.LeakMessage.
+func (mnt *Mount) LeakMessage() string {
+ return fmt.Sprintf("[vfs.Mount %p] reference count of %d instead of 0", mnt, atomic.LoadInt64(&mnt.refs))
}
// DecRef decrements mntns' reference count.
-func (mntns *MountNamespace) DecRef() {
+func (mntns *MountNamespace) DecRef(ctx context.Context) {
vfs := mntns.root.fs.VirtualFilesystem()
- if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 {
+ mntns.MountNamespaceRefs.DecRef(func() {
vfs.mountMu.Lock()
vfs.mounts.seq.BeginWrite()
vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(mntns.root, &umountRecursiveOptions{
@@ -499,14 +537,12 @@ func (mntns *MountNamespace) DecRef() {
vfs.mounts.seq.EndWrite()
vfs.mountMu.Unlock()
for _, vd := range vdsToDecRef {
- vd.DecRef()
+ vd.DecRef(ctx)
}
for _, mnt := range mountsToDecRef {
- mnt.DecRef()
+ mnt.DecRef(ctx)
}
- } else if refs < 0 {
- panic("MountNamespace.DecRef() called without holding a reference")
- }
+ })
}
// getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes
@@ -516,7 +552,7 @@ func (mntns *MountNamespace) DecRef() {
// getMountAt is analogous to Linux's fs/namei.c:follow_mount().
//
// Preconditions: References are held on mnt and d.
-func (vfs *VirtualFilesystem) getMountAt(mnt *Mount, d *Dentry) *Mount {
+func (vfs *VirtualFilesystem) getMountAt(ctx context.Context, mnt *Mount, d *Dentry) *Mount {
// The first mount is special-cased:
//
// - The caller is assumed to have checked d.isMounted() already. (This
@@ -547,7 +583,7 @@ retryFirst:
// Raced with umount.
continue
}
- mnt.DecRef()
+ mnt.DecRef(ctx)
mnt = next
d = next.root
}
@@ -558,9 +594,10 @@ retryFirst:
// mnt. It takes a reference on the returned VirtualDentry. If no such mount
// point exists (i.e. mnt is a root mount), getMountpointAt returns (nil, nil).
//
-// Preconditions: References are held on mnt and root. vfsroot is not (mnt,
-// mnt.root).
-func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) VirtualDentry {
+// Preconditions:
+// * References are held on mnt and root.
+// * vfsroot is not (mnt, mnt.root).
+func (vfs *VirtualFilesystem) getMountpointAt(ctx context.Context, mnt *Mount, vfsroot VirtualDentry) VirtualDentry {
// The first mount is special-cased:
//
// - The caller must have already checked mnt against vfsroot.
@@ -584,12 +621,12 @@ retryFirst:
if !point.TryIncRef() {
// Since Mount holds a reference on Mount.key.point, this can only
// happen due to a racing change to Mount.key.
- parent.DecRef()
+ parent.DecRef(ctx)
goto retryFirst
}
if !vfs.mounts.seq.ReadOk(epoch) {
- point.DecRef()
- parent.DecRef()
+ point.DecRef(ctx)
+ parent.DecRef(ctx)
goto retryFirst
}
mnt = parent
@@ -617,22 +654,29 @@ retryFirst:
if !point.TryIncRef() {
// Since Mount holds a reference on Mount.key.point, this can
// only happen due to a racing change to Mount.key.
- parent.DecRef()
+ parent.DecRef(ctx)
goto retryNotFirst
}
if !vfs.mounts.seq.ReadOk(epoch) {
- point.DecRef()
- parent.DecRef()
+ point.DecRef(ctx)
+ parent.DecRef(ctx)
goto retryNotFirst
}
- d.DecRef()
- mnt.DecRef()
+ d.DecRef(ctx)
+ mnt.DecRef(ctx)
mnt = parent
d = point
}
return VirtualDentry{mnt, d}
}
+// SetMountReadOnly sets the mount as ReadOnly.
+func (vfs *VirtualFilesystem) SetMountReadOnly(mnt *Mount, ro bool) error {
+ vfs.mountMu.Lock()
+ defer vfs.mountMu.Unlock()
+ return mnt.setReadOnlyLocked(ro)
+}
+
// CheckBeginWrite increments the counter of in-progress write operations on
// mnt. If mnt is mounted MS_RDONLY, CheckBeginWrite does nothing and returns
// EROFS.
@@ -670,7 +714,8 @@ func (mnt *Mount) setReadOnlyLocked(ro bool) error {
return nil
}
-func (mnt *Mount) readOnly() bool {
+// ReadOnly returns true if mount is readonly.
+func (mnt *Mount) ReadOnly() bool {
return atomic.LoadInt64(&mnt.writers) < 0
}
@@ -698,14 +743,12 @@ func (mnt *Mount) Root() *Dentry {
return mnt.root
}
-// Root returns mntns' root. A reference is taken on the returned
-// VirtualDentry.
+// Root returns mntns' root. It does not take a reference on the returned
+// VirtualDentry.
func (mntns *MountNamespace) Root() VirtualDentry {
vd := VirtualDentry{
mount: mntns.root,
dentry: mntns.root.root,
}
- vd.IncRef()
return vd
}
@@ -713,11 +756,23 @@ func (mntns *MountNamespace) Root() VirtualDentry {
//
// Preconditions: taskRootDir.Ok().
func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
- vfs.mountMu.Lock()
- defer vfs.mountMu.Unlock()
rootMnt := taskRootDir.mount
+
+ vfs.mountMu.Lock()
mounts := rootMnt.submountsLocked()
+ // Take a reference on mounts since we need to drop vfs.mountMu before
+ // calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()).
+ for _, mnt := range mounts {
+ mnt.IncRef()
+ }
+ vfs.mountMu.Unlock()
+ defer func() {
+ for _, mnt := range mounts {
+ mnt.DecRef(ctx)
+ }
+ }()
sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
+
for _, mnt := range mounts {
// Get the path to this mount relative to task root.
mntRootVD := VirtualDentry{
@@ -728,7 +783,7 @@ func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDi
if err != nil {
// For some reason we didn't get a path. Log a warning
// and run with empty path.
- ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err)
+ ctx.Warningf("VFS.GenerateProcMounts: error getting pathname for mount root %+v: %v", mnt.root, err)
path = ""
}
if path == "" {
@@ -738,10 +793,13 @@ func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDi
}
opts := "rw"
- if mnt.readOnly() {
+ if mnt.ReadOnly() {
opts = "ro"
}
- if mnt.flags.NoExec {
+ if mnt.Flags.NoATime {
+ opts = ",noatime"
+ }
+ if mnt.Flags.NoExec {
opts += ",noexec"
}
@@ -759,11 +817,25 @@ func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDi
//
// Preconditions: taskRootDir.Ok().
func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
- vfs.mountMu.Lock()
- defer vfs.mountMu.Unlock()
rootMnt := taskRootDir.mount
+
+ vfs.mountMu.Lock()
mounts := rootMnt.submountsLocked()
+ // Take a reference on mounts since we need to drop vfs.mountMu before
+ // calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()) or
+ // vfs.StatAt() (=> FilesystemImpl.StatAt()).
+ for _, mnt := range mounts {
+ mnt.IncRef()
+ }
+ vfs.mountMu.Unlock()
+ defer func() {
+ for _, mnt := range mounts {
+ mnt.DecRef(ctx)
+ }
+ }()
sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
+
+ creds := auth.CredentialsFromContext(ctx)
for _, mnt := range mounts {
// Get the path to this mount relative to task root.
mntRootVD := VirtualDentry{
@@ -774,7 +846,7 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo
if err != nil {
// For some reason we didn't get a path. Log a warning
// and run with empty path.
- ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err)
+ ctx.Warningf("VFS.GenerateProcMountInfo: error getting pathname for mount root %+v: %v", mnt.root, err)
path = ""
}
if path == "" {
@@ -787,9 +859,10 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo
Root: mntRootVD,
Start: mntRootVD,
}
- statx, err := vfs.StatAt(ctx, auth.NewAnonymousCredentials(), pop, &StatOptions{})
+ statx, err := vfs.StatAt(ctx, creds, pop, &StatOptions{})
if err != nil {
// Well that's not good. Ignore this mount.
+ ctx.Warningf("VFS.GenerateProcMountInfo: failed to stat mount root %+v: %v", mnt.root, err)
break
}
@@ -801,6 +874,9 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo
fmt.Fprintf(buf, "%d ", mnt.ID)
// (2) Parent ID (or this ID if there is no parent).
+ // Note that even if the call to mnt.parent() races with Mount
+ // destruction (which is possible since we're not holding vfs.mountMu),
+ // its Mount.ID will still be valid.
pID := mnt.ID
if p := mnt.parent(); p != nil {
pID = p.ID
@@ -823,14 +899,15 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo
// (6) Mount options.
opts := "rw"
- if mnt.readOnly() {
+ if mnt.ReadOnly() {
opts = "ro"
}
- if mnt.flags.NoExec {
+ if mnt.Flags.NoATime {
+ opts = ",noatime"
+ }
+ if mnt.Flags.NoExec {
opts += ",noexec"
}
- // TODO(gvisor.dev/issue/1193): Add "noatime" if MS_NOATIME is
- // set.
fmt.Fprintf(buf, "%s ", opts)
// (7) Optional fields: zero or more fields of the form "tag[:value]".
@@ -861,7 +938,7 @@ func superBlockOpts(mountPath string, mnt *Mount) string {
// gVisor doesn't (yet) have a concept of super block options, so we
// use the ro/rw bit from the mount flag.
opts := "rw"
- if mnt.readOnly() {
+ if mnt.ReadOnly() {
opts = "ro"
}
diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go
index 3335e4057..cb8c56bd3 100644
--- a/pkg/sentry/vfs/mount_test.go
+++ b/pkg/sentry/vfs/mount_test.go
@@ -38,7 +38,7 @@ func TestMountTableInsertLookup(t *testing.T) {
mt.Init()
mount := &Mount{}
- mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}})
+ mount.setKey(VirtualDentry{&Mount{}, &Dentry{}})
mt.Insert(mount)
if m := mt.Lookup(mount.parent(), mount.point()); m != mount {
@@ -79,7 +79,7 @@ const enableComparativeBenchmarks = false
func newBenchMount() *Mount {
mount := &Mount{}
- mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}})
+ mount.loadKey(VirtualDentry{&Mount{}, &Dentry{}})
return mount
}
@@ -94,7 +94,7 @@ func BenchmarkMountTableParallelLookup(b *testing.B) {
for i := 0; i < numMounts; i++ {
mount := newBenchMount()
mt.Insert(mount)
- keys = append(keys, mount.loadKey())
+ keys = append(keys, mount.saveKey())
}
var ready sync.WaitGroup
@@ -146,7 +146,7 @@ func BenchmarkMountMapParallelLookup(b *testing.B) {
keys := make([]VirtualDentry, 0, numMounts)
for i := 0; i < numMounts; i++ {
mount := newBenchMount()
- key := mount.loadKey()
+ key := mount.saveKey()
ms[key] = mount
keys = append(keys, key)
}
@@ -201,7 +201,7 @@ func BenchmarkMountSyncMapParallelLookup(b *testing.B) {
keys := make([]VirtualDentry, 0, numMounts)
for i := 0; i < numMounts; i++ {
mount := newBenchMount()
- key := mount.loadKey()
+ key := mount.getKey()
ms.Store(key, mount)
keys = append(keys, key)
}
@@ -283,7 +283,7 @@ func BenchmarkMountMapNegativeLookup(b *testing.B) {
ms := make(map[VirtualDentry]*Mount)
for i := 0; i < numMounts; i++ {
mount := newBenchMount()
- ms[mount.loadKey()] = mount
+ ms[mount.getKey()] = mount
}
negkeys := make([]VirtualDentry, 0, numMounts)
for i := 0; i < numMounts; i++ {
@@ -318,7 +318,7 @@ func BenchmarkMountSyncMapNegativeLookup(b *testing.B) {
var ms sync.Map
for i := 0; i < numMounts; i++ {
mount := newBenchMount()
- ms.Store(mount.loadKey(), mount)
+ ms.Store(mount.saveKey(), mount)
}
negkeys := make([]VirtualDentry, 0, numMounts)
for i := 0; i < numMounts; i++ {
@@ -372,7 +372,7 @@ func BenchmarkMountMapInsert(b *testing.B) {
b.ResetTimer()
for i := range mounts {
mount := mounts[i]
- ms[mount.loadKey()] = mount
+ ms[mount.saveKey()] = mount
}
}
@@ -392,7 +392,7 @@ func BenchmarkMountSyncMapInsert(b *testing.B) {
b.ResetTimer()
for i := range mounts {
mount := mounts[i]
- ms.Store(mount.loadKey(), mount)
+ ms.Store(mount.saveKey(), mount)
}
}
@@ -425,13 +425,13 @@ func BenchmarkMountMapRemove(b *testing.B) {
ms := make(map[VirtualDentry]*Mount)
for i := range mounts {
mount := mounts[i]
- ms[mount.loadKey()] = mount
+ ms[mount.saveKey()] = mount
}
b.ResetTimer()
for i := range mounts {
mount := mounts[i]
- delete(ms, mount.loadKey())
+ delete(ms, mount.saveKey())
}
}
@@ -447,12 +447,12 @@ func BenchmarkMountSyncMapRemove(b *testing.B) {
var ms sync.Map
for i := range mounts {
mount := mounts[i]
- ms.Store(mount.loadKey(), mount)
+ ms.Store(mount.saveKey(), mount)
}
b.ResetTimer()
for i := range mounts {
mount := mounts[i]
- ms.Delete(mount.loadKey())
+ ms.Delete(mount.saveKey())
}
}
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index 70f850ca4..cb48c37a1 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -13,7 +13,7 @@
// limitations under the License.
// +build go1.12
-// +build !go1.16
+// +build !go1.17
// Check go:linkname function signatures when updating Go version.
@@ -34,6 +34,8 @@ import (
// structurally identical to VirtualDentry, but stores its fields as
// unsafe.Pointer since mutators synchronize with VFS path traversal using
// seqcounts.
+//
+// This is explicitly not savable.
type mountKey struct {
parent unsafe.Pointer // *Mount
point unsafe.Pointer // *Dentry
@@ -47,19 +49,23 @@ func (mnt *Mount) point() *Dentry {
return (*Dentry)(atomic.LoadPointer(&mnt.key.point))
}
-func (mnt *Mount) loadKey() VirtualDentry {
+func (mnt *Mount) getKey() VirtualDentry {
return VirtualDentry{
mount: mnt.parent(),
dentry: mnt.point(),
}
}
+func (mnt *Mount) saveKey() VirtualDentry { return mnt.getKey() }
+
// Invariant: mnt.key.parent == nil. vd.Ok().
-func (mnt *Mount) storeKey(vd VirtualDentry) {
+func (mnt *Mount) setKey(vd VirtualDentry) {
atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(vd.mount))
atomic.StorePointer(&mnt.key.point, unsafe.Pointer(vd.dentry))
}
+func (mnt *Mount) loadKey(vd VirtualDentry) { mnt.setKey(vd) }
+
// mountTable maps (mount parent, mount point) pairs to mounts. It supports
// efficient concurrent lookup, even in the presence of concurrent mutators
// (provided mutation is sufficiently uncommon).
@@ -205,6 +211,26 @@ loop:
}
}
+// Range calls f on each Mount in mt. If f returns false, Range stops iteration
+// and returns immediately.
+func (mt *mountTable) Range(f func(*Mount) bool) {
+ tcap := uintptr(1) << (mt.size & mtSizeOrderMask)
+ slotPtr := mt.slots
+ last := unsafe.Pointer(uintptr(mt.slots) + ((tcap - 1) * mountSlotBytes))
+ for {
+ slot := (*mountSlot)(slotPtr)
+ if slot.value != nil {
+ if !f((*Mount)(slot.value)) {
+ return
+ }
+ }
+ if slotPtr == last {
+ return
+ }
+ slotPtr = unsafe.Pointer(uintptr(slotPtr) + mountSlotBytes)
+ }
+}
+
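
Range walks the table by raw stride arithmetic because the slots live in one manually managed allocation rather than a Go slice. A self-contained imitation over an ordinary array, with illustrative slot contents (the real table also stores hashes and tolerates concurrent mutators, which this ignores):

package main

import (
	"fmt"
	"unsafe"
)

type slot struct {
	value unsafe.Pointer
	hash  uintptr
}

var slotBytes = unsafe.Sizeof(slot{})

// rangeSlots visits every occupied slot in a contiguous array using the
// same pointer-stride iteration as mountTable.Range, stopping early if f
// returns false.
func rangeSlots(base unsafe.Pointer, n uintptr, f func(unsafe.Pointer) bool) {
	last := unsafe.Pointer(uintptr(base) + (n-1)*slotBytes)
	for p := base; ; p = unsafe.Pointer(uintptr(p) + slotBytes) {
		s := (*slot)(p)
		if s.value != nil && !f(s.value) {
			return
		}
		if p == last {
			return
		}
	}
}

func main() {
	var slots [4]slot
	x, y := 1, 2
	slots[1].value = unsafe.Pointer(&x)
	slots[3].value = unsafe.Pointer(&y)
	rangeSlots(unsafe.Pointer(&slots[0]), uintptr(len(slots)), func(p unsafe.Pointer) bool {
		fmt.Println(*(*int)(p))
		return true
	})
}
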
// Insert inserts the given mount into mt.
//
// Preconditions: mt must not already contain a Mount with the same mount point
@@ -217,8 +243,9 @@ func (mt *mountTable) Insert(mount *Mount) {
// insertSeqed inserts the given mount into mt.
//
-// Preconditions: mt.seq must be in a writer critical section. mt must not
-// already contain a Mount with the same mount point and parent.
+// Preconditions:
+// * mt.seq must be in a writer critical section.
+// * mt must not already contain a Mount with the same mount point and parent.
func (mt *mountTable) insertSeqed(mount *Mount) {
hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
@@ -269,9 +296,11 @@ func (mt *mountTable) insertSeqed(mount *Mount) {
atomic.StorePointer(&mt.slots, newSlots)
}
-// Preconditions: There are no concurrent mutators of the table (slots, cap).
-// If the table is visible to readers, then mt.seq must be in a writer critical
-// section. cap must be a power of 2.
+// Preconditions:
+// * There are no concurrent mutators of the table (slots, cap).
+// * If the table is visible to readers, then mt.seq must be in a writer
+// critical section.
+// * cap must be a power of 2.
func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, hash uintptr) {
mask := cap - 1
off := (hash & mask) * mountSlotBytes
@@ -313,8 +342,9 @@ func (mt *mountTable) Remove(mount *Mount) {
// removeSeqed removes the given mount from mt.
//
-// Preconditions: mt.seq must be in a writer critical section. mt must contain
-// mount.
+// Preconditions:
+// * mt.seq must be in a writer critical section.
+// * mt must contain mount.
func (mt *mountTable) removeSeqed(mount *Mount) {
hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
tcap := uintptr(1) << (mt.size & mtSizeOrderMask)
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 53d364c5c..bc79e5ecc 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -21,6 +21,8 @@ import (
// GetDentryOptions contains options to VirtualFilesystem.GetDentryAt() and
// FilesystemImpl.GetDentryAt().
+//
+// +stateify savable
type GetDentryOptions struct {
// If CheckSearchable is true, FilesystemImpl.GetDentryAt() must check that
// the returned Dentry is a directory for which creds has search
@@ -30,6 +32,8 @@ type GetDentryOptions struct {
// MkdirOptions contains options to VirtualFilesystem.MkdirAt() and
// FilesystemImpl.MkdirAt().
+//
+// +stateify savable
type MkdirOptions struct {
// Mode is the file mode bits for the created directory.
Mode linux.FileMode
@@ -56,6 +60,8 @@ type MkdirOptions struct {
// MknodOptions contains options to VirtualFilesystem.MknodAt() and
// FilesystemImpl.MknodAt().
+//
+// +stateify savable
type MknodOptions struct {
// Mode is the file type and mode bits for the created file.
Mode linux.FileMode
@@ -72,12 +78,31 @@ type MknodOptions struct {
// MountFlags contains flags as specified for mount(2), e.g. MS_NOEXEC.
// MS_RDONLY is not part of MountFlags because it's tracked in Mount.writers.
+//
+// +stateify savable
type MountFlags struct {
// NoExec is equivalent to MS_NOEXEC.
NoExec bool
+
+ // NoATime is equivalent to MS_NOATIME and indicates that the
+ // filesystem should not update access time in-place.
+ NoATime bool
+
+ // NoDev is equivalent to MS_NODEV and indicates that the
+ // filesystem should not allow access to devices (special files).
+ // TODO(gvisor.dev/issue/3186): respect this flag in non-FUSE
+ // filesystems.
+ NoDev bool
+
+ // NoSUID is equivalent to MS_NOSUID and indicates that the
+ // filesystem should not honor set-user-ID and set-group-ID bits or
+ // file capabilities when executing programs.
+ NoSUID bool
}
// MountOptions contains options to VirtualFilesystem.MountAt().
+//
+// +stateify savable
type MountOptions struct {
// Flags contains flags as specified for mount(2), e.g. MS_NOEXEC.
Flags MountFlags
@@ -88,13 +113,17 @@ type MountOptions struct {
// GetFilesystemOptions contains options to FilesystemType.GetFilesystem().
GetFilesystemOptions GetFilesystemOptions
- // If InternalMount is true, allow the use of filesystem types for which
- // RegisterFilesystemTypeOptions.AllowUserMount == false.
+ // InternalMount indicates whether the mount operation is coming from within
+ // the sentry, as opposed to an application request via mount(2). If
+ // InternalMount is true, allow the use of filesystem types for which
+ // RegisterFilesystemTypeOptions.AllowUserMount == false.
InternalMount bool
}
// OpenOptions contains options to VirtualFilesystem.OpenAt() and
// FilesystemImpl.OpenAt().
+//
+// +stateify savable
type OpenOptions struct {
// Flags contains access mode and flags as specified for open(2).
//
@@ -120,6 +149,8 @@ type OpenOptions struct {
// ReadOptions contains options to FileDescription.PRead(),
// FileDescriptionImpl.PRead(), FileDescription.Read(), and
// FileDescriptionImpl.Read().
+//
+// +stateify savable
type ReadOptions struct {
// Flags contains flags as specified for preadv2(2).
Flags uint32
@@ -127,6 +158,8 @@ type ReadOptions struct {
// RenameOptions contains options to VirtualFilesystem.RenameAt() and
// FilesystemImpl.RenameAt().
+//
+// +stateify savable
type RenameOptions struct {
// Flags contains flags as specified for renameat2(2).
Flags uint32
@@ -138,6 +171,8 @@ type RenameOptions struct {
// SetStatOptions contains options to VirtualFilesystem.SetStatAt(),
// FilesystemImpl.SetStatAt(), FileDescription.SetStat(), and
// FileDescriptionImpl.SetStat().
+//
+// +stateify savable
type SetStatOptions struct {
// Stat is the metadata that should be set. Only fields indicated by
// Stat.Mask should be set.
@@ -149,10 +184,18 @@ type SetStatOptions struct {
// == UTIME_OMIT (VFS users must unset the corresponding bit in Stat.Mask
// instead).
Stat linux.Statx
+
+ // NeedWritePerm indicates that write permission on the file is needed for
+ // this operation. This is needed for truncate(2) (note that ftruncate(2)
+ // does not require the same check--instead, it checks that the fd is
+ // writable).
+ NeedWritePerm bool
}
// BoundEndpointOptions contains options to VirtualFilesystem.BoundEndpointAt()
// and FilesystemImpl.BoundEndpointAt().
+//
+// +stateify savable
type BoundEndpointOptions struct {
// Addr is the path of the file whose socket endpoint is being retrieved.
// It is generally irrelevant: most endpoints are stored at a dentry that
@@ -169,10 +212,12 @@ type BoundEndpointOptions struct {
Addr string
}
-// GetxattrOptions contains options to VirtualFilesystem.GetxattrAt(),
-// FilesystemImpl.GetxattrAt(), FileDescription.Getxattr(), and
-// FileDescriptionImpl.Getxattr().
-type GetxattrOptions struct {
+// GetXattrOptions contains options to VirtualFilesystem.GetXattrAt(),
+// FilesystemImpl.GetXattrAt(), FileDescription.GetXattr(), and
+// FileDescriptionImpl.GetXattr().
+//
+// +stateify savable
+type GetXattrOptions struct {
// Name is the name of the extended attribute to retrieve.
Name string
@@ -183,10 +228,12 @@ type GetxattrOptions struct {
Size uint64
}
-// SetxattrOptions contains options to VirtualFilesystem.SetxattrAt(),
-// FilesystemImpl.SetxattrAt(), FileDescription.Setxattr(), and
-// FileDescriptionImpl.Setxattr().
-type SetxattrOptions struct {
+// SetXattrOptions contains options to VirtualFilesystem.SetXattrAt(),
+// FilesystemImpl.SetXattrAt(), FileDescription.SetXattr(), and
+// FileDescriptionImpl.SetXattr().
+//
+// +stateify savable
+type SetXattrOptions struct {
// Name is the name of the extended attribute being mutated.
Name string
@@ -200,6 +247,8 @@ type SetxattrOptions struct {
// StatOptions contains options to VirtualFilesystem.StatAt(),
// FilesystemImpl.StatAt(), FileDescription.Stat(), and
// FileDescriptionImpl.Stat().
+//
+// +stateify savable
type StatOptions struct {
// Mask is the set of fields in the returned Statx that the FilesystemImpl
// or FileDescriptionImpl should provide. Bits are as in linux.Statx.Mask.
@@ -217,6 +266,8 @@ type StatOptions struct {
}
// UmountOptions contains options to VirtualFilesystem.UmountAt().
+//
+// +stateify savable
type UmountOptions struct {
// Flags contains flags as specified for umount2(2).
Flags uint32
@@ -225,6 +276,8 @@ type UmountOptions struct {
// WriteOptions contains options to FileDescription.PWrite(),
// FileDescriptionImpl.PWrite(), FileDescription.Write(), and
// FileDescriptionImpl.Write().
+//
+// +stateify savable
type WriteOptions struct {
// Flags contains flags as specified for pwritev2(2).
Flags uint32
diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go
index cd78d66bc..e4da15009 100644
--- a/pkg/sentry/vfs/pathname.go
+++ b/pkg/sentry/vfs/pathname.go
@@ -47,7 +47,7 @@ func (vfs *VirtualFilesystem) PathnameWithDeleted(ctx context.Context, vfsroot,
haveRef := false
defer func() {
if haveRef {
- vd.DecRef()
+ vd.DecRef(ctx)
}
}()
@@ -64,12 +64,12 @@ loop:
// of FilesystemImpl.PrependPath() may return nil instead.
break loop
}
- nextVD := vfs.getMountpointAt(vd.mount, vfsroot)
+ nextVD := vfs.getMountpointAt(ctx, vd.mount, vfsroot)
if !nextVD.Ok() {
break loop
}
if haveRef {
- vd.DecRef()
+ vd.DecRef(ctx)
}
vd = nextVD
haveRef = true
@@ -101,7 +101,7 @@ func (vfs *VirtualFilesystem) PathnameReachable(ctx context.Context, vfsroot, vd
haveRef := false
defer func() {
if haveRef {
- vd.DecRef()
+ vd.DecRef(ctx)
}
}()
loop:
@@ -112,12 +112,12 @@ loop:
if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
break loop
}
- nextVD := vfs.getMountpointAt(vd.mount, vfsroot)
+ nextVD := vfs.getMountpointAt(ctx, vd.mount, vfsroot)
if !nextVD.Ok() {
return "", nil
}
if haveRef {
- vd.DecRef()
+ vd.DecRef(ctx)
}
vd = nextVD
haveRef = true
@@ -145,7 +145,7 @@ func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd
haveRef := false
defer func() {
if haveRef {
- vd.DecRef()
+ vd.DecRef(ctx)
}
}()
unreachable := false
@@ -157,13 +157,13 @@ loop:
if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
break loop
}
- nextVD := vfs.getMountpointAt(vd.mount, vfsroot)
+ nextVD := vfs.getMountpointAt(ctx, vd.mount, vfsroot)
if !nextVD.Ok() {
unreachable = true
break loop
}
if haveRef {
- vd.DecRef()
+ vd.DecRef(ctx)
}
vd = nextVD
haveRef = true
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
index f9647f90e..d48520d58 100644
--- a/pkg/sentry/vfs/permissions.go
+++ b/pkg/sentry/vfs/permissions.go
@@ -16,6 +16,7 @@ package vfs
import (
"math"
+ "strings"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
@@ -25,6 +26,8 @@ import (
)
// AccessTypes is a bitmask of Unix file permissions.
+//
+// +stateify savable
type AccessTypes uint16
// Bits in AccessTypes.
@@ -94,6 +97,37 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, mode linu
return syserror.EACCES
}
+// MayLink determines whether creating a hard link to a file with the given
+// mode, kuid, and kgid is permitted.
+//
+// This corresponds to Linux's fs/namei.c:may_linkat.
+func MayLink(creds *auth.Credentials, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
+ // Source inode owner can hardlink all they like; otherwise, it must be a
+ // safe source.
+ if CanActAsOwner(creds, kuid) {
+ return nil
+ }
+
+ // Only regular files can be hard linked.
+ if mode.FileType() != linux.S_IFREG {
+ return syserror.EPERM
+ }
+
+ // Setuid files should not get pinned to the filesystem.
+ if mode&linux.S_ISUID != 0 {
+ return syserror.EPERM
+ }
+
+ // Executable setgid files should not get pinned to the filesystem, but we
+ // don't support S_IXGRP anyway.
+
+ // Hardlinking to unreadable or unwritable sources is dangerous.
+ if err := GenericCheckPermissions(creds, MayRead|MayWrite, mode, kuid, kgid); err != nil {
+ return syserror.EPERM
+ }
+ return nil
+}
+
// AccessTypesForOpenFlags returns the access types required to open a file
// with the given OpenOptions.Flags. Note that this is NOT the same thing as
// the set of accesses permitted for the opened file:
@@ -152,7 +186,8 @@ func MayWriteFileWithOpenFlags(flags uint32) bool {
// CheckSetStat checks that creds has permission to change the metadata of a
// file with the given permissions, UID, and GID as specified by stat, subject
// to the rules of Linux's fs/attr.c:setattr_prepare().
-func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
+func CheckSetStat(ctx context.Context, creds *auth.Credentials, opts *SetStatOptions, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
+ stat := &opts.Stat
if stat.Mask&linux.STATX_SIZE != 0 {
limit, err := CheckLimit(ctx, 0, int64(stat.Size))
if err != nil {
@@ -184,6 +219,11 @@ func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Stat
return syserror.EPERM
}
}
+ if opts.NeedWritePerm && !creds.HasCapability(linux.CAP_DAC_OVERRIDE) {
+ if err := GenericCheckPermissions(creds, MayWrite, mode, kuid, kgid); err != nil {
+ return err
+ }
+ }
if stat.Mask&(linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME) != 0 {
if !CanActAsOwner(creds, kuid) {
if (stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW) ||
@@ -199,6 +239,20 @@ func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Stat
return nil
}
+// CheckDeleteSticky checks whether the sticky bit is set on a directory with
+// the given file mode, and if so, checks whether creds has permission to
+// remove a file owned by childKUID from a directory with the given mode.
+// CheckDeleteSticky is consistent with Linux's
+// include/linux/fs.h:check_sticky().
+func CheckDeleteSticky(creds *auth.Credentials, parentMode linux.FileMode, childKUID auth.KUID) error {
+ if parentMode&linux.ModeSticky == 0 {
+ return nil
+ }
+ if CanActAsOwner(creds, childKUID) {
+ return nil
+ }
+ return syserror.EPERM
+}
+
// CanActAsOwner returns true if creds can act as the owner of a file with the
// given owning UID, consistent with Linux's
// fs/inode.c:inode_owner_or_capable().
@@ -220,7 +274,7 @@ func HasCapabilityOnFile(creds *auth.Credentials, cp linux.Capability, kuid auth
// operation must not proceed. Otherwise it returns the max length allowed
// without violating the limit.
func CheckLimit(ctx context.Context, offset, size int64) (int64, error) {
- fileSizeLimit := limits.FromContext(ctx).Get(limits.FileSize).Cur
+ fileSizeLimit := limits.FromContextOrDie(ctx).Get(limits.FileSize).Cur
if fileSizeLimit > math.MaxInt64 {
return size, nil
}
@@ -233,3 +287,40 @@ func CheckLimit(ctx context.Context, offset, size int64) (int64, error) {
}
return size, nil
}
+
+// CheckXattrPermissions checks permissions for extended attribute access.
+// This is analogous to fs/xattr.c:xattr_permission(). Some key differences:
+// * Does not check for read-only filesystem property.
+// * Does not check inode immutability or append-only mode. In both cases EPERM
+// must be returned by filesystem implementations.
+// * Does not do inode permission checks. Filesystem implementations should
+// handle inode permission checks as they may differ across implementations.
+func CheckXattrPermissions(creds *auth.Credentials, ats AccessTypes, mode linux.FileMode, kuid auth.KUID, name string) error {
+ switch {
+ case strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX):
+ // The trusted.* namespace can only be accessed by privileged
+ // users.
+ if creds.HasCapability(linux.CAP_SYS_ADMIN) {
+ return nil
+ }
+ if ats.MayWrite() {
+ return syserror.EPERM
+ }
+ return syserror.ENODATA
+ case strings.HasPrefix(name, linux.XATTR_USER_PREFIX):
+ // In the user.* namespace, only regular files and directories can have
+ // extended attributes. For sticky directories, only the owner and
+ // privileged users can write attributes.
+ filetype := mode.FileType()
+ if filetype != linux.ModeRegular && filetype != linux.ModeDirectory {
+ if ats.MayWrite() {
+ return syserror.EPERM
+ }
+ return syserror.ENODATA
+ }
+ if filetype == linux.ModeDirectory && mode&linux.ModeSticky != 0 && ats.MayWrite() && !CanActAsOwner(creds, kuid) {
+ return syserror.EPERM
+ }
+ }
+ return nil
+}
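
Reduced to its branches, the namespace gating reads as follows; boolean parameters stand in for the capability, ownership, and mode checks, so this is a sketch of the control flow rather than the sentry's API:

package main

import (
	"errors"
	"fmt"
	"strings"
)

var (
	errPerm   = errors.New("EPERM")
	errNoData = errors.New("ENODATA")
)

// checkXattrNS sketches the gating above: trusted.* needs a privileged
// caller, and user.* on a sticky directory is writable only by the owner.
func checkXattrNS(name string, write, isAdmin, isOwner, stickyDir bool) error {
	switch {
	case strings.HasPrefix(name, "trusted."):
		if isAdmin {
			return nil
		}
		if write {
			return errPerm
		}
		return errNoData
	case strings.HasPrefix(name, "user."):
		if stickyDir && write && !isOwner {
			return errPerm
		}
	}
	return nil
}

func main() {
	fmt.Println(checkXattrNS("trusted.overlay.opaque", true, false, false, false)) // EPERM
	fmt.Println(checkXattrNS("user.note", true, false, true, true))                // <nil>
}
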
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index 9d047ff88..e4fd55012 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -18,6 +18,7 @@ import (
"fmt"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sync"
@@ -34,6 +35,8 @@ import (
// FilesystemImpl methods.
//
// ResolvingPath is loosely analogous to Linux's struct nameidata.
+//
+// +stateify savable
type ResolvingPath struct {
vfs *VirtualFilesystem
root VirtualDentry // refs borrowed from PathOperation
@@ -87,6 +90,7 @@ func init() {
// so error "constants" are really mutable vars, necessitating somewhat
// expensive interface object comparisons.
+// +stateify savable
type resolveMountRootOrJumpError struct{}
// Error implements error.Error.
@@ -94,6 +98,7 @@ func (resolveMountRootOrJumpError) Error() string {
return "resolving mount root or jump"
}
+// +stateify savable
type resolveMountPointError struct{}
// Error implements error.Error.
@@ -101,6 +106,7 @@ func (resolveMountPointError) Error() string {
return "resolving mount point"
}
+// +stateify savable
type resolveAbsSymlinkError struct{}
// Error implements error.Error.
@@ -136,31 +142,31 @@ func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *Pat
return rp
}
-func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) {
+func (vfs *VirtualFilesystem) putResolvingPath(ctx context.Context, rp *ResolvingPath) {
rp.root = VirtualDentry{}
- rp.decRefStartAndMount()
+ rp.decRefStartAndMount(ctx)
rp.mount = nil
rp.start = nil
- rp.releaseErrorState()
+ rp.releaseErrorState(ctx)
resolvingPathPool.Put(rp)
}
-func (rp *ResolvingPath) decRefStartAndMount() {
+func (rp *ResolvingPath) decRefStartAndMount(ctx context.Context) {
if rp.flags&rpflagsHaveStartRef != 0 {
- rp.start.DecRef()
+ rp.start.DecRef(ctx)
}
if rp.flags&rpflagsHaveMountRef != 0 {
- rp.mount.DecRef()
+ rp.mount.DecRef(ctx)
}
}
-func (rp *ResolvingPath) releaseErrorState() {
+func (rp *ResolvingPath) releaseErrorState(ctx context.Context) {
if rp.nextStart != nil {
- rp.nextStart.DecRef()
+ rp.nextStart.DecRef(ctx)
rp.nextStart = nil
}
if rp.nextMount != nil {
- rp.nextMount.DecRef()
+ rp.nextMount.DecRef(ctx)
rp.nextMount = nil
}
}
@@ -236,13 +242,13 @@ func (rp *ResolvingPath) Advance() {
// Restart resets the stream of path components represented by rp to its state
// on entry to the current FilesystemImpl method.
-func (rp *ResolvingPath) Restart() {
+func (rp *ResolvingPath) Restart(ctx context.Context) {
rp.pit = rp.origParts[rp.numOrigParts-1]
rp.mustBeDir = rp.mustBeDirOrig
rp.symlinks = rp.symlinksOrig
rp.curPart = rp.numOrigParts - 1
copy(rp.parts[:], rp.origParts[:rp.numOrigParts])
- rp.releaseErrorState()
+ rp.releaseErrorState(ctx)
}
func (rp *ResolvingPath) relpathCommit() {
@@ -260,13 +266,13 @@ func (rp *ResolvingPath) relpathCommit() {
// Mount, CheckRoot returns (unspecified, non-nil error). Otherwise, path
// resolution should resolve d's parent normally, and CheckRoot returns (false,
// nil).
-func (rp *ResolvingPath) CheckRoot(d *Dentry) (bool, error) {
+func (rp *ResolvingPath) CheckRoot(ctx context.Context, d *Dentry) (bool, error) {
if d == rp.root.dentry && rp.mount == rp.root.mount {
// At contextual VFS root (due to e.g. chroot(2)).
return true, nil
} else if d == rp.mount.root {
// At mount root ...
- vd := rp.vfs.getMountpointAt(rp.mount, rp.root)
+ vd := rp.vfs.getMountpointAt(ctx, rp.mount, rp.root)
if vd.Ok() {
// ... of non-root mount.
rp.nextMount = vd.mount
@@ -283,11 +289,11 @@ func (rp *ResolvingPath) CheckRoot(d *Dentry) (bool, error) {
// to d. If d is a mount point, such that path resolution should switch to
// another Mount, CheckMount returns a non-nil error. Otherwise, CheckMount
// returns nil.
-func (rp *ResolvingPath) CheckMount(d *Dentry) error {
+func (rp *ResolvingPath) CheckMount(ctx context.Context, d *Dentry) error {
if !d.isMounted() {
return nil
}
- if mnt := rp.vfs.getMountAt(rp.mount, d); mnt != nil {
+ if mnt := rp.vfs.getMountAt(ctx, rp.mount, d); mnt != nil {
rp.nextMount = mnt
return resolveMountPointError{}
}
@@ -389,11 +395,11 @@ func (rp *ResolvingPath) HandleJump(target VirtualDentry) error {
return resolveMountRootOrJumpError{}
}
-func (rp *ResolvingPath) handleError(err error) bool {
+func (rp *ResolvingPath) handleError(ctx context.Context, err error) bool {
switch err.(type) {
case resolveMountRootOrJumpError:
// Switch to the new Mount. We hold references on the Mount and Dentry.
- rp.decRefStartAndMount()
+ rp.decRefStartAndMount(ctx)
rp.mount = rp.nextMount
rp.start = rp.nextStart
rp.flags |= rpflagsHaveMountRef | rpflagsHaveStartRef
@@ -412,7 +418,7 @@ func (rp *ResolvingPath) handleError(err error) bool {
case resolveMountPointError:
// Switch to the new Mount. We hold a reference on the Mount, but
// borrow the reference on the mount root from the Mount.
- rp.decRefStartAndMount()
+ rp.decRefStartAndMount(ctx)
rp.mount = rp.nextMount
rp.start = rp.nextMount.root
rp.flags = rp.flags&^rpflagsHaveStartRef | rpflagsHaveMountRef
@@ -423,12 +429,12 @@ func (rp *ResolvingPath) handleError(err error) bool {
// path.
rp.relpathCommit()
// Restart path resolution on the new Mount.
- rp.releaseErrorState()
+ rp.releaseErrorState(ctx)
return true
case resolveAbsSymlinkError:
// Switch to the new Mount. References are borrowed from rp.root.
- rp.decRefStartAndMount()
+ rp.decRefStartAndMount(ctx)
rp.mount = rp.root.mount
rp.start = rp.root.dentry
rp.flags &^= rpflagsHaveMountRef | rpflagsHaveStartRef
@@ -440,7 +446,7 @@ func (rp *ResolvingPath) handleError(err error) bool {
// path, including the symlink target we just prepended.
rp.relpathCommit()
// Restart path resolution on the new Mount.
- rp.releaseErrorState()
+ rp.releaseErrorState(ctx)
return true
default:
diff --git a/pkg/sentry/vfs/save_restore.go b/pkg/sentry/vfs/save_restore.go
new file mode 100644
index 000000000..46e50d55d
--- /dev/null
+++ b/pkg/sentry/vfs/save_restore.go
@@ -0,0 +1,124 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "fmt"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/refsvfs2"
+)
+
+// FilesystemImplSaveRestoreExtension is an optional extension to
+// FilesystemImpl.
+type FilesystemImplSaveRestoreExtension interface {
+ // PrepareSave prepares this filesystem for serialization.
+ PrepareSave(ctx context.Context) error
+
+ // CompleteRestore completes restoration from checkpoint for this
+ // filesystem after deserialization.
+ CompleteRestore(ctx context.Context, opts CompleteRestoreOptions) error
+}
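
A FilesystemImpl opts in to checkpoint/restore support by implementing this optional interface. A minimal sketch, assuming a hypothetical myfs package (none of the names below are part of this change):

package myfs

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// filesystem is a hypothetical FilesystemImpl; its other methods are elided.
type filesystem struct {
	vfsfs vfs.Filesystem
}

// PrepareSave flushes state that cannot be serialized (host FDs, dirty
// caches, ...) before the sentry is checkpointed.
func (fs *filesystem) PrepareSave(ctx context.Context) error {
	return nil // nothing to flush in this sketch
}

// CompleteRestore revalidates externally-visible state after restore.
func (fs *filesystem) CompleteRestore(ctx context.Context, opts vfs.CompleteRestoreOptions) error {
	if opts.ValidateFileSizes {
		// Re-stat remote files and fail if their sizes changed (elided).
	}
	return nil
}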
+
+// PrepareSave prepares all filesystems for serialization.
+func (vfs *VirtualFilesystem) PrepareSave(ctx context.Context) error {
+ failures := 0
+ for fs := range vfs.getFilesystems() {
+ if ext, ok := fs.impl.(FilesystemImplSaveRestoreExtension); ok {
+ if err := ext.PrepareSave(ctx); err != nil {
+ ctx.Warningf("%T.PrepareSave failed: %v", fs.impl, err)
+ failures++
+ }
+ }
+ fs.DecRef(ctx)
+ }
+ if failures != 0 {
+ return fmt.Errorf("%d filesystems failed to prepare for serialization", failures)
+ }
+ return nil
+}
+
+// CompleteRestore completes restoration from checkpoint for all filesystems
+// after deserialization.
+func (vfs *VirtualFilesystem) CompleteRestore(ctx context.Context, opts *CompleteRestoreOptions) error {
+ failures := 0
+ for fs := range vfs.getFilesystems() {
+ if ext, ok := fs.impl.(FilesystemImplSaveRestoreExtension); ok {
+ if err := ext.CompleteRestore(ctx, *opts); err != nil {
+ ctx.Warningf("%T.CompleteRestore failed: %v", fs.impl, err)
+ failures++
+ }
+ }
+ fs.DecRef(ctx)
+ }
+ if failures != 0 {
+ return fmt.Errorf("%d filesystems failed to complete restore after deserialization", failures)
+ }
+ return nil
+}
+
+// CompleteRestoreOptions contains options to
+// VirtualFilesystem.CompleteRestore() and
+// FilesystemImplSaveRestoreExtension.CompleteRestore().
+type CompleteRestoreOptions struct {
+ // If ValidateFileSizes is true, filesystem implementations backed by
+ // remote filesystems should verify that file sizes have not changed
+ // between checkpoint and restore.
+ ValidateFileSizes bool
+
+ // If ValidateFileModificationTimestamps is true, filesystem
+ // implementations backed by remote filesystems should validate that file
+ // mtimes have not changed between checkpoint and restore.
+ ValidateFileModificationTimestamps bool
+}
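
As a usage sketch, a checkpoint driver would call these hooks around the actual (de)serialization, which is elided here; the driver shape is an assumption, not part of this change:

// Save path: quiesce filesystems before serializing kernel state.
func checkpoint(ctx context.Context, vfsObj *vfs.VirtualFilesystem) error {
	if err := vfsObj.PrepareSave(ctx); err != nil {
		return err
	}
	// ... serialize kernel state ...
	return nil
}

// Restore path: revalidate filesystems after deserializing kernel state.
func restore(ctx context.Context, vfsObj *vfs.VirtualFilesystem) error {
	// ... deserialize kernel state ...
	return vfsObj.CompleteRestore(ctx, &vfs.CompleteRestoreOptions{
		ValidateFileSizes:                  true,
		ValidateFileModificationTimestamps: true,
	})
}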
+
+// saveMounts is called by stateify.
+func (vfs *VirtualFilesystem) saveMounts() []*Mount {
+ if atomic.LoadPointer(&vfs.mounts.slots) == nil {
+ // vfs.Init() was never called.
+ return nil
+ }
+ var mounts []*Mount
+ vfs.mounts.Range(func(mount *Mount) bool {
+ mounts = append(mounts, mount)
+ return true
+ })
+ return mounts
+}
+
+// loadMounts is called by stateify.
+func (vfs *VirtualFilesystem) loadMounts(mounts []*Mount) {
+ if mounts == nil {
+ return
+ }
+ vfs.mounts.Init()
+ for _, mount := range mounts {
+ vfs.mounts.Insert(mount)
+ }
+}
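
These two hooks pair with the state:".([]*Mount)" tag added to VirtualFilesystem.mounts below. Conceptually, the round trip stateify generates looks like the following, where enc and dec stand in for the state package's encoder and decoder:

// Inside package vfs, conceptually:
saved := vfs.saveMounts()    // flatten the mountTable into a []*Mount
payload := enc(saved)        // serialized alongside the rest of VFS state
vfs.loadMounts(dec(payload)) // on restore, re-insert every Mount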
+
+// afterLoad is called by stateify.
+func (mnt *Mount) afterLoad() {
+ if refsvfs2.LeakCheckEnabled() && atomic.LoadInt64(&mnt.refs) != 0 {
+ refsvfs2.Register(mnt, "vfs.Mount")
+ }
+}
+
+// afterLoad is called by stateify.
+func (epi *epollInterest) afterLoad() {
+ // Mark all epollInterests as ready after restore so that the next call to
+ // EpollInstance.ReadEvents() rechecks their readiness.
+ epi.Callback(nil)
+}
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 52643a7c5..48d6252f7 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -24,6 +24,9 @@
// Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry
// VirtualFilesystem.filesystemsMu
// EpollInstance.mu
+// Inotify.mu
+// Watches.mu
+// Inotify.evMu
// VirtualFilesystem.fsTypesMu
//
// Locking Dentry.mu in multiple Dentries requires holding
@@ -33,6 +36,7 @@ package vfs
import (
"fmt"
+ "path"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
@@ -67,7 +71,7 @@ type VirtualFilesystem struct {
// points.
//
// mounts is analogous to Linux's mount_hashtable.
- mounts mountTable
+ mounts mountTable `state:".([]*Mount)"`
// mountpoints maps mount points to mounts at those points in all
// namespaces. mountpoints is protected by mountMu.
@@ -119,7 +123,10 @@ type VirtualFilesystem struct {
}
// Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes.
-func (vfs *VirtualFilesystem) Init() error {
+func (vfs *VirtualFilesystem) Init(ctx context.Context) error {
+ if vfs.mountpoints != nil {
+ panic("VFS already initialized")
+ }
vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{})
vfs.devices = make(map[devTuple]*registeredDevice)
vfs.anonBlockDevMinorNext = 1
@@ -139,7 +146,7 @@ func (vfs *VirtualFilesystem) Init() error {
devMinor: anonfsDevMinor,
}
anonfs.vfsfs.Init(vfs, &anonFilesystemType{}, &anonfs)
- defer anonfs.vfsfs.DecRef()
+ defer anonfs.vfsfs.DecRef(ctx)
anonMount, err := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{})
if err != nil {
// We should not be passing any MountOptions that would cause
@@ -151,11 +158,23 @@ func (vfs *VirtualFilesystem) Init() error {
return nil
}
+// Release drops references on filesystem objects held by vfs.
+//
+// Precondition: This must be called after VFS.Init() has succeeded.
+func (vfs *VirtualFilesystem) Release(ctx context.Context) {
+ vfs.anonMount.DecRef(ctx)
+ for _, fst := range vfs.fsTypes {
+ fst.fsType.Release(ctx)
+ }
+}
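
A minimal lifecycle sketch under the new signatures (mount setup between the two calls is elided):

func newVFS(ctx context.Context) (*vfs.VirtualFilesystem, error) {
	vfsObj := &vfs.VirtualFilesystem{}
	if err := vfsObj.Init(ctx); err != nil { // panics if Init ran before
		return nil, err
	}
	// Once Init has succeeded, the caller owns vfsObj.Release(ctx).
	return vfsObj, nil
}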
+
// PathOperation specifies the path operated on by a VFS method.
//
// PathOperation is passed to VFS methods by pointer to reduce memory copying:
// it's somewhat large and should never escape. (Options structs are passed by
// pointer to VFS and FileDescription methods for the same reason.)
+//
+// +stateify savable
type PathOperation struct {
// Root is the VFS root. References on Root are borrowed from the provider
// of the PathOperation.
@@ -186,11 +205,11 @@ func (vfs *VirtualFilesystem) AccessAt(ctx context.Context, creds *auth.Credenti
for {
err := rp.mount.fs.impl.AccessAt(ctx, rp, creds, ats)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return nil
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return err
}
}
@@ -208,11 +227,11 @@ func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Crede
dentry: d,
}
rp.mount.IncRef()
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return vd, nil
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return VirtualDentry{}, err
}
}
@@ -230,7 +249,7 @@ func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *au
}
rp.mount.IncRef()
name := rp.Component()
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return parentVD, name, nil
}
if checkInvariants {
@@ -238,8 +257,8 @@ func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *au
panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return VirtualDentry{}, "", err
}
}
@@ -254,14 +273,14 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential
}
if !newpop.Path.Begin.Ok() {
- oldVD.DecRef()
+ oldVD.DecRef(ctx)
if newpop.Path.Absolute {
return syserror.EEXIST
}
return syserror.ENOENT
}
if newpop.FollowFinalSymlink {
- oldVD.DecRef()
+ oldVD.DecRef(ctx)
ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink")
return syserror.EINVAL
}
@@ -270,8 +289,8 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential
for {
err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD)
if err == nil {
- vfs.putResolvingPath(rp)
- oldVD.DecRef()
+ vfs.putResolvingPath(ctx, rp)
+ oldVD.DecRef(ctx)
return nil
}
if checkInvariants {
@@ -279,9 +298,9 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential
panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
- oldVD.DecRef()
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
+ oldVD.DecRef(ctx)
return err
}
}
@@ -290,6 +309,8 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential
// MkdirAt creates a directory at the given path.
func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error {
if !pop.Path.Begin.Ok() {
+ // pop.Path should not be empty in operations that create/delete files.
+ // This is consistent with mkdirat(dirfd, "", mode).
if pop.Path.Absolute {
return syserror.EEXIST
}
@@ -307,7 +328,7 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia
for {
err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return nil
}
if checkInvariants {
@@ -315,8 +336,8 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia
panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return err
}
}
@@ -326,6 +347,8 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia
// error from the syserror package.
func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error {
if !pop.Path.Begin.Ok() {
+ // pop.Path should not be empty in operations that create/delete files.
+ // This is consistent with mknodat(dirfd, "", mode, dev).
if pop.Path.Absolute {
return syserror.EEXIST
}
@@ -340,7 +363,7 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia
for {
err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return nil
}
if checkInvariants {
@@ -348,8 +371,8 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia
panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return err
}
}
@@ -402,31 +425,31 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
for {
fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
if opts.FileExec {
- if fd.Mount().flags.NoExec {
- fd.DecRef()
+ if fd.Mount().Flags.NoExec {
+ fd.DecRef(ctx)
return nil, syserror.EACCES
}
// Only a regular file can be executed.
stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE})
if err != nil {
- fd.DecRef()
+ fd.DecRef(ctx)
return nil, err
}
if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG {
- fd.DecRef()
+ fd.DecRef(ctx)
return nil, syserror.EACCES
}
}
- fd.Dentry().InotifyWithParent(linux.IN_OPEN, 0, PathEvent)
+ fd.Dentry().InotifyWithParent(ctx, linux.IN_OPEN, 0, PathEvent)
return fd, nil
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return nil, err
}
}
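
As a usage sketch of the exec checks above, an execve-style open might look like this (vfsObj, creds, root, wd, and the path are assumptions):

fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
	Root:  root,
	Start: wd,
	Path:  fspath.Parse("/bin/true"),
}, &vfs.OpenOptions{
	Flags:    linux.O_RDONLY,
	FileExec: true, // EACCES on noexec mounts and non-regular files
})
if err != nil {
	return err
}
defer fd.DecRef(ctx)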
@@ -438,11 +461,11 @@ func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Creden
for {
target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return target, nil
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return "", err
}
}
@@ -466,19 +489,19 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti
return err
}
if oldName == "." || oldName == ".." {
- oldParentVD.DecRef()
+ oldParentVD.DecRef(ctx)
return syserror.EBUSY
}
if !newpop.Path.Begin.Ok() {
- oldParentVD.DecRef()
+ oldParentVD.DecRef(ctx)
if newpop.Path.Absolute {
return syserror.EBUSY
}
return syserror.ENOENT
}
if newpop.FollowFinalSymlink {
- oldParentVD.DecRef()
+ oldParentVD.DecRef(ctx)
ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink")
return syserror.EINVAL
}
@@ -491,8 +514,8 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti
for {
err := rp.mount.fs.impl.RenameAt(ctx, rp, oldParentVD, oldName, renameOpts)
if err == nil {
- vfs.putResolvingPath(rp)
- oldParentVD.DecRef()
+ vfs.putResolvingPath(ctx, rp)
+ oldParentVD.DecRef(ctx)
return nil
}
if checkInvariants {
@@ -500,9 +523,9 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti
panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
- oldParentVD.DecRef()
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
+ oldParentVD.DecRef(ctx)
return err
}
}
@@ -511,6 +534,8 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti
// RmdirAt removes the directory at the given path.
func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
if !pop.Path.Begin.Ok() {
+ // pop.Path should not be empty in operations that create/delete files.
+ // This is consistent with unlinkat(dirfd, "", AT_REMOVEDIR).
if pop.Path.Absolute {
return syserror.EBUSY
}
@@ -525,7 +550,7 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia
for {
err := rp.mount.fs.impl.RmdirAt(ctx, rp)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return nil
}
if checkInvariants {
@@ -533,8 +558,8 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia
panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return err
}
}
@@ -546,11 +571,11 @@ func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credent
for {
err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return nil
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return err
}
}
@@ -562,11 +587,11 @@ func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credential
for {
stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return stat, nil
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return linux.Statx{}, err
}
}
@@ -579,11 +604,11 @@ func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credenti
for {
statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return statfs, nil
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return linux.Statfs{}, err
}
}
@@ -592,6 +617,8 @@ func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credenti
// SymlinkAt creates a symbolic link at the given path with the given target.
func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error {
if !pop.Path.Begin.Ok() {
+ // pop.Path should not be empty in operations that create/delete files.
+ // This is consistent with symlinkat(oldpath, newdirfd, "").
if pop.Path.Absolute {
return syserror.EEXIST
}
@@ -606,7 +633,7 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent
for {
err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return nil
}
if checkInvariants {
@@ -614,8 +641,8 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent
panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return err
}
}
@@ -624,6 +651,8 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent
// UnlinkAt deletes the non-directory file at the given path.
func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
if !pop.Path.Begin.Ok() {
+ // pop.Path should not be empty in operations that create/delete files.
+ // This is consistent with unlinkat(dirfd, "", 0).
if pop.Path.Absolute {
return syserror.EBUSY
}
@@ -638,7 +667,7 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti
for {
err := rp.mount.fs.impl.UnlinkAt(ctx, rp)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return nil
}
if checkInvariants {
@@ -646,8 +675,8 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti
panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return err
}
}
@@ -655,17 +684,11 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti
// BoundEndpointAt gets the bound endpoint at the given path, if one exists.
func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *BoundEndpointOptions) (transport.BoundEndpoint, error) {
- if !pop.Path.Begin.Ok() {
- if pop.Path.Absolute {
- return nil, syserror.ECONNREFUSED
- }
- return nil, syserror.ENOENT
- }
rp := vfs.getResolvingPath(creds, pop)
for {
bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return bep, nil
}
if checkInvariants {
@@ -673,21 +696,21 @@ func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.C
panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return nil, err
}
}
}
-// ListxattrAt returns all extended attribute names for the file at the given
+// ListXattrAt returns all extended attribute names for the file at the given
// path.
-func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) {
+func (vfs *VirtualFilesystem) ListXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) {
rp := vfs.getResolvingPath(creds, pop)
for {
- names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp, size)
+ names, err := rp.mount.fs.impl.ListXattrAt(ctx, rp, size)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return names, nil
}
if err == syserror.ENOTSUP {
@@ -695,61 +718,61 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede
// fs/xattr.c:vfs_listxattr() falls back to allowing the security
// subsystem to return security extended attributes, which by
// default don't exist.
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return nil, nil
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return nil, err
}
}
}
-// GetxattrAt returns the value associated with the given extended attribute
+// GetXattrAt returns the value associated with the given extended attribute
// for the file at the given path.
-func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetxattrOptions) (string, error) {
+func (vfs *VirtualFilesystem) GetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetXattrOptions) (string, error) {
rp := vfs.getResolvingPath(creds, pop)
for {
- val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
+ val, err := rp.mount.fs.impl.GetXattrAt(ctx, rp, *opts)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return val, nil
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return "", err
}
}
}
-// SetxattrAt changes the value associated with the given extended attribute
+// SetXattrAt changes the value associated with the given extended attribute
// for the file at the given path.
-func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetxattrOptions) error {
+func (vfs *VirtualFilesystem) SetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetXattrOptions) error {
rp := vfs.getResolvingPath(creds, pop)
for {
- err := rp.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
+ err := rp.mount.fs.impl.SetXattrAt(ctx, rp, *opts)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return nil
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return err
}
}
}
-// RemovexattrAt removes the given extended attribute from the file at rp.
-func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error {
+// RemoveXattrAt removes the given extended attribute from the file at rp.
+func (vfs *VirtualFilesystem) RemoveXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error {
rp := vfs.getResolvingPath(creds, pop)
for {
- err := rp.mount.fs.impl.RemovexattrAt(ctx, rp, name)
+ err := rp.mount.fs.impl.RemoveXattrAt(ctx, rp, name)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return nil
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return err
}
}
@@ -757,23 +780,83 @@ func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Cre
// SyncAllFilesystems has the semantics of Linux's sync(2).
func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error {
+ var retErr error
+ for fs := range vfs.getFilesystems() {
+ if err := fs.impl.Sync(ctx); err != nil && retErr == nil {
+ retErr = err
+ }
+ fs.DecRef(ctx)
+ }
+ return retErr
+}
+
+func (vfs *VirtualFilesystem) getFilesystems() map[*Filesystem]struct{} {
fss := make(map[*Filesystem]struct{})
vfs.filesystemsMu.Lock()
+ defer vfs.filesystemsMu.Unlock()
for fs := range vfs.filesystems {
if !fs.TryIncRef() {
continue
}
fss[fs] = struct{}{}
}
- vfs.filesystemsMu.Unlock()
- var retErr error
- for fs := range fss {
- if err := fs.impl.Sync(ctx); err != nil && retErr == nil {
- retErr = err
- }
- fs.DecRef()
+ return fss
+}
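
getFilesystems hands back a snapshot with a reference held on every Filesystem, so each caller must DecRef what it receives, as PrepareSave, CompleteRestore, and SyncAllFilesystems all do:

for fs := range vfs.getFilesystems() {
	// ... use fs.impl ...
	fs.DecRef(ctx) // drop the reference TryIncRef took above
}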
+
+// MkdirAllAt recursively creates non-existent directories on the given path
+// (including the last component).
+func (vfs *VirtualFilesystem) MkdirAllAt(ctx context.Context, currentPath string, root VirtualDentry, creds *auth.Credentials, mkdirOpts *MkdirOptions) error {
+ pop := &PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(currentPath),
+ }
+ stat, err := vfs.StatAt(ctx, creds, pop, &StatOptions{Mask: linux.STATX_TYPE})
+ switch err {
+ case nil:
+ if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.FileTypeMask != linux.ModeDirectory {
+ return syserror.ENOTDIR
+ }
+ // Directory already exists.
+ return nil
+ case syserror.ENOENT:
+ // Expected, we will create the dir.
+ default:
+ return fmt.Errorf("stat failed for %q during directory creation: %w", currentPath, err)
+ }
+
+ // Recurse to ensure parent is created and then create the final directory.
+ if err := vfs.MkdirAllAt(ctx, path.Dir(currentPath), root, creds, mkdirOpts); err != nil {
+ return err
}
- return retErr
+ if err := vfs.MkdirAt(ctx, creds, pop, mkdirOpts); err != nil {
+ return fmt.Errorf("failed to create directory %q: %w", currentPath, err)
+ }
+ return nil
+}
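
Usage sketch (the path and mode are illustrative):

if err := vfsObj.MkdirAllAt(ctx, "/var/run/app", root, creds, &vfs.MkdirOptions{
	Mode: 0755,
}); err != nil {
	return err
}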
+
+// MakeSyntheticMountpoint creates parent directories of target if they do not
+// exist and attempts to create a directory for the mountpoint. If a
+// non-directory file already exists there, it is tolerated and left in place.
+func (vfs *VirtualFilesystem) MakeSyntheticMountpoint(ctx context.Context, target string, root VirtualDentry, creds *auth.Credentials) error {
+ mkdirOpts := &MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}
+
+ // Make sure the parent directory of target exists.
+ if err := vfs.MkdirAllAt(ctx, path.Dir(target), root, creds, mkdirOpts); err != nil {
+ return fmt.Errorf("failed to create parent directory of mountpoint %q: %w", target, err)
+ }
+
+ // Attempt to mkdir the final component. If a file (of any type) already
+ // exists there, we allow mounting on top of it because, unlike Linux
+ // mount(2), we do not require the target to be an existing directory.
+ if err := vfs.MkdirAt(ctx, creds, &PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(target),
+ }, mkdirOpts); err != nil && err != syserror.EEXIST {
+ return fmt.Errorf("failed to create mountpoint %q: %w", target, err)
+ }
+ return nil
}
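
And the matching call before mounting over a path that may not exist yet (the target is illustrative):

if err := vfsObj.MakeSyntheticMountpoint(ctx, "/proc", root, creds); err != nil {
	return err
}
// A mount at /proc can now proceed even if the underlying image lacks it.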
// A VirtualDentry represents a node in a VFS tree, by combining a Dentry
@@ -825,9 +908,9 @@ func (vd VirtualDentry) IncRef() {
// DecRef decrements the reference counts on the Mount and Dentry represented
// by vd.
-func (vd VirtualDentry) DecRef() {
- vd.dentry.DecRef()
- vd.mount.DecRef()
+func (vd VirtualDentry) DecRef(ctx context.Context) {
+ vd.dentry.DecRef(ctx)
+ vd.mount.DecRef(ctx)
}
// Mount returns the Mount associated with vd. It does not take a reference on
diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go
index 101497ed6..bbafb8b7f 100644
--- a/pkg/sentry/watchdog/watchdog.go
+++ b/pkg/sentry/watchdog/watchdog.go
@@ -77,7 +77,10 @@ var DefaultOpts = Opts{
// trigger it.
const descheduleThreshold = 1 * time.Second
-var stuckTasks = metric.MustCreateNewUint64Metric("/watchdog/stuck_tasks_detected", true /* sync */, "Cumulative count of stuck tasks detected")
+var (
+ stuckStartup = metric.MustCreateNewUint64Metric("/watchdog/stuck_startup_detected", true /* sync */, "Incremented once on startup watchdog timeout")
+ stuckTasks = metric.MustCreateNewUint64Metric("/watchdog/stuck_tasks_detected", true /* sync */, "Cumulative count of stuck tasks detected")
+)
// Amount of time to wait before dumping the stack to the log again when the same task(s) remains stuck.
var stackDumpSameTaskPeriod = time.Minute
@@ -93,15 +96,33 @@ const (
Panic
)
+// Set implements flag.Value.
+func (a *Action) Set(v string) error {
+ switch v {
+ case "log", "logwarning":
+ *a = LogWarning
+ case "panic":
+ *a = Panic
+ default:
+ return fmt.Errorf("invalid watchdog action %q", v)
+ }
+ return nil
+}
+
+// Get implements flag.Value.
+func (a *Action) Get() interface{} {
+ return *a
+}
+
// String returns Action's string representation.
-func (a Action) String() string {
- switch a {
+func (a *Action) String() string {
+ switch *a {
case LogWarning:
- return "LogWarning"
+ return "logWarning"
case Panic:
- return "Panic"
+ return "panic"
default:
- panic(fmt.Sprintf("Invalid action: %d", a))
+ panic(fmt.Sprintf("Invalid watchdog action: %d", *a))
}
}
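
With Set and String defined on the pointer receiver, *Action satisfies flag.Value (and Get makes it a flag.Getter), so it can be wired to a flag directly; the flag name here is an assumption:

var taskAction = watchdog.LogWarning

func init() {
	flag.Var(&taskAction, "watchdog-action", "watchdog timeout action: log or panic")
}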
@@ -220,6 +241,9 @@ func (w *Watchdog) waitForStart() {
// We are fine.
return
}
+
+ stuckStartup.Increment()
+
var buf bytes.Buffer
buf.WriteString(fmt.Sprintf("Watchdog.Start() not called within %s", w.StartupTimeout))
w.doAction(w.StartupTimeoutAction, false, &buf)
@@ -323,13 +347,13 @@ func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound boo
func (w *Watchdog) reportStuckWatchdog() {
var buf bytes.Buffer
- buf.WriteString("Watchdog goroutine is stuck:")
+ buf.WriteString("Watchdog goroutine is stuck")
w.doAction(w.TaskTimeoutAction, false, &buf)
}
// doAction will take the given action. If the action is LogWarning, the stack
-// is not always dumpped to the log to prevent log flooding. "forceStack"
-// guarantees that the stack will be dumped regarless.
+// is not always dumped to the log to prevent log flooding. "forceStack"
+// guarantees that the stack will be dumped regardless.
func (w *Watchdog) doAction(action Action, forceStack bool, msg *bytes.Buffer) {
switch action {
case LogWarning: