Diffstat (limited to 'pkg/sentry')
-rw-r--r-- pkg/sentry/arch/arch_aarch64.go | 35
-rw-r--r-- pkg/sentry/arch/arch_amd64.go | 4
-rw-r--r-- pkg/sentry/arch/arch_arm64.go | 6
-rw-r--r-- pkg/sentry/arch/arch_x86.go | 16
-rw-r--r-- pkg/sentry/control/BUILD | 5
-rw-r--r-- pkg/sentry/control/logging.go | 4
-rw-r--r-- pkg/sentry/control/pprof.go | 6
-rw-r--r-- pkg/sentry/control/proc.go | 132
-rw-r--r-- pkg/sentry/device/device.go | 3
-rw-r--r-- pkg/sentry/devices/memdev/full.go | 3
-rw-r--r-- pkg/sentry/devices/memdev/null.go | 3
-rw-r--r-- pkg/sentry/devices/memdev/random.go | 3
-rw-r--r-- pkg/sentry/devices/memdev/zero.go | 3
-rw-r--r-- pkg/sentry/devices/ttydev/BUILD | 16
-rw-r--r-- pkg/sentry/devices/ttydev/ttydev.go | 91
-rw-r--r-- pkg/sentry/devices/tundev/BUILD | 23
-rw-r--r-- pkg/sentry/devices/tundev/tundev.go | 178
-rw-r--r-- pkg/sentry/fdimport/fdimport.go | 13
-rw-r--r-- pkg/sentry/fs/copy_up.go | 12
-rw-r--r-- pkg/sentry/fs/copy_up_test.go | 4
-rw-r--r-- pkg/sentry/fs/dev/net_tun.go | 4
-rw-r--r-- pkg/sentry/fs/dirent.go | 110
-rw-r--r-- pkg/sentry/fs/dirent_cache.go | 3
-rw-r--r-- pkg/sentry/fs/dirent_refs_test.go | 16
-rw-r--r-- pkg/sentry/fs/dirent_state.go | 3
-rw-r--r-- pkg/sentry/fs/fdpipe/pipe.go | 2
-rw-r--r-- pkg/sentry/fs/fdpipe/pipe_opener_test.go | 16
-rw-r--r-- pkg/sentry/fs/fdpipe/pipe_test.go | 18
-rw-r--r-- pkg/sentry/fs/file.go | 21
-rw-r--r-- pkg/sentry/fs/file_operations.go | 3
-rw-r--r-- pkg/sentry/fs/file_overlay.go | 22
-rw-r--r-- pkg/sentry/fs/filesystems.go | 14
-rw-r--r-- pkg/sentry/fs/fs.go | 3
-rw-r--r-- pkg/sentry/fs/fsutil/BUILD | 7
-rw-r--r-- pkg/sentry/fs/fsutil/dirty_set.go | 7
-rw-r--r-- pkg/sentry/fs/fsutil/file.go | 4
-rw-r--r-- pkg/sentry/fs/fsutil/file_range_set.go | 15
-rw-r--r-- pkg/sentry/fs/fsutil/frame_ref_set.go | 10
-rw-r--r-- pkg/sentry/fs/fsutil/host_file_mapper.go | 5
-rw-r--r-- pkg/sentry/fs/fsutil/host_mappable.go | 19
-rw-r--r-- pkg/sentry/fs/fsutil/inode_cached.go | 25
-rw-r--r-- pkg/sentry/fs/g3doc/.gitignore | 1
-rw-r--r-- pkg/sentry/fs/g3doc/fuse.md | 263
-rw-r--r-- pkg/sentry/fs/gofer/file.go | 4
-rw-r--r-- pkg/sentry/fs/gofer/gofer_test.go | 8
-rw-r--r-- pkg/sentry/fs/gofer/handles.go | 5
-rw-r--r-- pkg/sentry/fs/gofer/inode.go | 7
-rw-r--r-- pkg/sentry/fs/gofer/path.go | 6
-rw-r--r-- pkg/sentry/fs/gofer/session.go | 16
-rw-r--r-- pkg/sentry/fs/gofer/session_state.go | 3
-rw-r--r-- pkg/sentry/fs/gofer/socket.go | 6
-rw-r--r-- pkg/sentry/fs/host/BUILD | 1
-rw-r--r-- pkg/sentry/fs/host/control.go | 2
-rw-r--r-- pkg/sentry/fs/host/file.go | 4
-rw-r--r-- pkg/sentry/fs/host/inode.go | 3
-rw-r--r-- pkg/sentry/fs/host/inode_test.go | 2
-rw-r--r-- pkg/sentry/fs/host/socket.go | 20
-rw-r--r-- pkg/sentry/fs/host/socket_iovec.go | 7
-rw-r--r-- pkg/sentry/fs/host/socket_test.go | 38
-rw-r--r-- pkg/sentry/fs/host/tty.go | 10
-rw-r--r-- pkg/sentry/fs/host/wait_test.go | 2
-rw-r--r-- pkg/sentry/fs/inode.go | 11
-rw-r--r-- pkg/sentry/fs/inode_inotify.go | 5
-rw-r--r-- pkg/sentry/fs/inode_overlay.go | 30
-rw-r--r-- pkg/sentry/fs/inode_overlay_test.go | 8
-rw-r--r-- pkg/sentry/fs/inotify.go | 8
-rw-r--r-- pkg/sentry/fs/inotify_watch.go | 9
-rw-r--r-- pkg/sentry/fs/lock/lock.go | 41
-rw-r--r-- pkg/sentry/fs/lock/lock_set_functions.go | 8
-rw-r--r-- pkg/sentry/fs/lock/lock_test.go | 111
-rw-r--r-- pkg/sentry/fs/mount.go | 12
-rw-r--r-- pkg/sentry/fs/mount_overlay.go | 6
-rw-r--r-- pkg/sentry/fs/mount_test.go | 29
-rw-r--r-- pkg/sentry/fs/mounts.go | 102
-rw-r--r-- pkg/sentry/fs/mounts_test.go | 2
-rw-r--r-- pkg/sentry/fs/overlay.go | 10
-rw-r--r-- pkg/sentry/fs/proc/fds.go | 18
-rw-r--r-- pkg/sentry/fs/proc/mounts.go | 8
-rw-r--r-- pkg/sentry/fs/proc/net.go | 12
-rw-r--r-- pkg/sentry/fs/proc/proc.go | 2
-rw-r--r-- pkg/sentry/fs/proc/sys_net.go | 95
-rw-r--r-- pkg/sentry/fs/proc/task.go | 4
-rw-r--r-- pkg/sentry/fs/ramfs/dir.go | 18
-rw-r--r-- pkg/sentry/fs/ramfs/tree_test.go | 2
-rw-r--r-- pkg/sentry/fs/timerfd/timerfd.go | 4
-rw-r--r-- pkg/sentry/fs/tmpfs/file_test.go | 2
-rw-r--r-- pkg/sentry/fs/tty/dir.go | 8
-rw-r--r-- pkg/sentry/fs/tty/fs.go | 2
-rw-r--r-- pkg/sentry/fs/tty/master.go | 8
-rw-r--r-- pkg/sentry/fs/tty/slave.go | 4
-rw-r--r-- pkg/sentry/fs/user/BUILD | 8
-rw-r--r-- pkg/sentry/fs/user/path.go | 170
-rw-r--r-- pkg/sentry/fs/user/user.go | 10
-rw-r--r-- pkg/sentry/fs/user/user_test.go | 8
-rw-r--r-- pkg/sentry/fsbridge/bridge.go | 2
-rw-r--r-- pkg/sentry/fsbridge/fs.go | 8
-rw-r--r-- pkg/sentry/fsbridge/vfs.go | 6
-rw-r--r-- pkg/sentry/fsimpl/devpts/BUILD | 1
-rw-r--r-- pkg/sentry/fsimpl/devpts/devpts.go | 8
-rw-r--r-- pkg/sentry/fsimpl/devpts/master.go | 27
-rw-r--r-- pkg/sentry/fsimpl/devpts/slave.go | 29
-rw-r--r-- pkg/sentry/fsimpl/devtmpfs/devtmpfs.go | 22
-rw-r--r-- pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go | 8
-rw-r--r-- pkg/sentry/fsimpl/eventfd/eventfd.go | 7
-rw-r--r-- pkg/sentry/fsimpl/eventfd/eventfd_test.go | 12
-rw-r--r-- pkg/sentry/fsimpl/ext/BUILD | 5
-rw-r--r-- pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go | 6
-rw-r--r-- pkg/sentry/fsimpl/ext/block_map_file.go | 9
-rw-r--r-- pkg/sentry/fsimpl/ext/block_map_test.go | 29
-rw-r--r-- pkg/sentry/fsimpl/ext/dentry.go | 20
-rw-r--r-- pkg/sentry/fsimpl/ext/directory.go | 24
-rw-r--r-- pkg/sentry/fsimpl/ext/ext.go | 10
-rw-r--r-- pkg/sentry/fsimpl/ext/ext_test.go | 4
-rw-r--r-- pkg/sentry/fsimpl/ext/extent_file.go | 5
-rw-r--r-- pkg/sentry/fsimpl/ext/extent_test.go | 22
-rw-r--r-- pkg/sentry/fsimpl/ext/file_description.go | 1
-rw-r--r-- pkg/sentry/fsimpl/ext/filesystem.go | 68
-rw-r--r-- pkg/sentry/fsimpl/ext/inode.go | 28
-rw-r--r-- pkg/sentry/fsimpl/ext/regular_file.go | 31
-rw-r--r-- pkg/sentry/fsimpl/ext/symlink.go | 16
-rw-r--r-- pkg/sentry/fsimpl/fuse/BUILD | 63
-rw-r--r-- pkg/sentry/fsimpl/fuse/connection.go | 437
-rw-r--r-- pkg/sentry/fsimpl/fuse/dev.go | 397
-rw-r--r-- pkg/sentry/fsimpl/fuse/dev_test.go | 428
-rw-r--r-- pkg/sentry/fsimpl/fuse/fusefs.go | 324
-rw-r--r-- pkg/sentry/fsimpl/fuse/init.go | 166
-rw-r--r-- pkg/sentry/fsimpl/fuse/register.go | 42
-rw-r--r-- pkg/sentry/fsimpl/gofer/BUILD | 5
-rw-r--r-- pkg/sentry/fsimpl/gofer/directory.go | 27
-rw-r--r-- pkg/sentry/fsimpl/gofer/filesystem.go | 416
-rw-r--r-- pkg/sentry/fsimpl/gofer/gofer.go | 794
-rw-r--r-- pkg/sentry/fsimpl/gofer/gofer_test.go | 12
-rw-r--r-- pkg/sentry/fsimpl/gofer/handle.go | 14
-rw-r--r-- pkg/sentry/fsimpl/gofer/host_named_pipe.go | 97
-rw-r--r-- pkg/sentry/fsimpl/gofer/regular_file.go | 321
-rw-r--r-- pkg/sentry/fsimpl/gofer/socket.go | 6
-rw-r--r-- pkg/sentry/fsimpl/gofer/special_file.go | 183
-rw-r--r-- pkg/sentry/fsimpl/gofer/time.go | 18
-rw-r--r-- pkg/sentry/fsimpl/host/BUILD | 3
-rw-r--r-- pkg/sentry/fsimpl/host/control.go | 2
-rw-r--r-- pkg/sentry/fsimpl/host/host.go | 141
-rw-r--r-- pkg/sentry/fsimpl/host/mmap.go | 21
-rw-r--r-- pkg/sentry/fsimpl/host/socket.go | 21
-rw-r--r-- pkg/sentry/fsimpl/host/socket_iovec.go | 7
-rw-r--r-- pkg/sentry/fsimpl/host/tty.go | 21
-rw-r--r-- pkg/sentry/fsimpl/host/util.go | 10
-rw-r--r-- pkg/sentry/fsimpl/kernfs/BUILD | 3
-rw-r--r-- pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 28
-rw-r--r-- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 40
-rw-r--r-- pkg/sentry/fsimpl/kernfs/filesystem.go | 80
-rw-r--r-- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 24
-rw-r--r-- pkg/sentry/fsimpl/kernfs/kernfs.go | 49
-rw-r--r-- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 30
-rw-r--r-- pkg/sentry/fsimpl/overlay/BUILD | 41
-rw-r--r-- pkg/sentry/fsimpl/overlay/copy_up.go | 262
-rw-r--r-- pkg/sentry/fsimpl/overlay/directory.go | 289
-rw-r--r-- pkg/sentry/fsimpl/overlay/filesystem.go | 1364
-rw-r--r-- pkg/sentry/fsimpl/overlay/non_directory.go | 266
-rw-r--r-- pkg/sentry/fsimpl/overlay/overlay.go | 627
-rw-r--r-- pkg/sentry/fsimpl/pipefs/pipefs.go | 13
-rw-r--r-- pkg/sentry/fsimpl/proc/BUILD | 1
-rw-r--r-- pkg/sentry/fsimpl/proc/filesystem.go | 4
-rw-r--r-- pkg/sentry/fsimpl/proc/subtasks.go | 10
-rw-r--r-- pkg/sentry/fsimpl/proc/task.go | 8
-rw-r--r-- pkg/sentry/fsimpl/proc/task_fds.go | 27
-rw-r--r-- pkg/sentry/fsimpl/proc/task_files.go | 100
-rw-r--r-- pkg/sentry/fsimpl/proc/task_net.go | 12
-rw-r--r-- pkg/sentry/fsimpl/proc/tasks.go | 8
-rw-r--r-- pkg/sentry/fsimpl/proc/tasks_sys.go | 49
-rw-r--r-- pkg/sentry/fsimpl/proc/tasks_test.go | 10
-rw-r--r-- pkg/sentry/fsimpl/signalfd/signalfd.go | 5
-rw-r--r-- pkg/sentry/fsimpl/sockfs/sockfs.go | 4
-rw-r--r-- pkg/sentry/fsimpl/sys/BUILD | 2
-rw-r--r-- pkg/sentry/fsimpl/sys/sys.go | 36
-rw-r--r-- pkg/sentry/fsimpl/sys/sys_test.go | 4
-rw-r--r-- pkg/sentry/fsimpl/testutil/BUILD | 2
-rw-r--r-- pkg/sentry/fsimpl/testutil/kernel.go | 14
-rw-r--r-- pkg/sentry/fsimpl/testutil/testutil.go | 6
-rw-r--r-- pkg/sentry/fsimpl/timerfd/timerfd.go | 7
-rw-r--r-- pkg/sentry/fsimpl/tmpfs/BUILD | 15
-rw-r--r-- pkg/sentry/fsimpl/tmpfs/benchmark_test.go | 50
-rw-r--r-- pkg/sentry/fsimpl/tmpfs/device_file.go | 4
-rw-r--r-- pkg/sentry/fsimpl/tmpfs/directory.go | 12
-rw-r--r-- pkg/sentry/fsimpl/tmpfs/filesystem.go | 210
-rw-r--r-- pkg/sentry/fsimpl/tmpfs/named_pipe.go | 4
-rw-r--r-- pkg/sentry/fsimpl/tmpfs/pipe_test.go | 20
-rw-r--r-- pkg/sentry/fsimpl/tmpfs/regular_file.go | 100
-rw-r--r-- pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 33
-rw-r--r-- pkg/sentry/fsimpl/tmpfs/socket_file.go | 4
-rw-r--r-- pkg/sentry/fsimpl/tmpfs/symlink.go | 4
-rw-r--r-- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 261
-rw-r--r-- pkg/sentry/fsimpl/tmpfs/tmpfs_test.go | 6
-rw-r--r-- pkg/sentry/inet/inet.go | 17
-rw-r--r-- pkg/sentry/inet/test_stack.go | 12
-rw-r--r-- pkg/sentry/kernel/BUILD | 3
-rw-r--r-- pkg/sentry/kernel/abstract_socket_namespace.go | 13
-rw-r--r-- pkg/sentry/kernel/aio.go | 81
-rw-r--r-- pkg/sentry/kernel/auth/credentials.go | 28
-rw-r--r-- pkg/sentry/kernel/context.go | 53
-rw-r--r-- pkg/sentry/kernel/epoll/epoll.go | 45
-rw-r--r-- pkg/sentry/kernel/epoll/epoll_state.go | 3
-rw-r--r-- pkg/sentry/kernel/epoll/epoll_test.go | 5
-rw-r--r-- pkg/sentry/kernel/eventfd/eventfd.go | 4
-rw-r--r-- pkg/sentry/kernel/fasync/BUILD | 1
-rw-r--r-- pkg/sentry/kernel/fasync/fasync.go | 18
-rw-r--r-- pkg/sentry/kernel/fd_table.go | 105
-rw-r--r-- pkg/sentry/kernel/fd_table_test.go | 6
-rw-r--r-- pkg/sentry/kernel/fs_context.go | 31
-rw-r--r-- pkg/sentry/kernel/futex/BUILD | 1
-rw-r--r-- pkg/sentry/kernel/futex/futex.go | 43
-rw-r--r-- pkg/sentry/kernel/futex/futex_test.go | 66
-rw-r--r-- pkg/sentry/kernel/kernel.go | 112
-rw-r--r-- pkg/sentry/kernel/pipe/BUILD | 3
-rw-r--r-- pkg/sentry/kernel/pipe/node.go | 6
-rw-r--r-- pkg/sentry/kernel/pipe/node_test.go | 2
-rw-r--r-- pkg/sentry/kernel/pipe/pipe.go | 8
-rw-r--r-- pkg/sentry/kernel/pipe/pipe_test.go | 16
-rw-r--r-- pkg/sentry/kernel/pipe/pipe_unsafe.go | 35
-rw-r--r-- pkg/sentry/kernel/pipe/pipe_util.go | 2
-rw-r--r-- pkg/sentry/kernel/pipe/reader.go | 3
-rw-r--r-- pkg/sentry/kernel/pipe/vfs.go | 255
-rw-r--r-- pkg/sentry/kernel/pipe/writer.go | 3
-rw-r--r-- pkg/sentry/kernel/ptrace.go | 10
-rw-r--r-- pkg/sentry/kernel/sessions.go | 5
-rw-r--r-- pkg/sentry/kernel/shm/BUILD | 1
-rw-r--r-- pkg/sentry/kernel/shm/shm.go | 13
-rw-r--r-- pkg/sentry/kernel/signalfd/signalfd.go | 2
-rw-r--r-- pkg/sentry/kernel/syslog.go | 9
-rw-r--r-- pkg/sentry/kernel/task.go | 27
-rw-r--r-- pkg/sentry/kernel/task_clone.go | 14
-rw-r--r-- pkg/sentry/kernel/task_exec.go | 10
-rw-r--r-- pkg/sentry/kernel/task_exit.go | 11
-rw-r--r-- pkg/sentry/kernel/task_futex.go | 125
-rw-r--r-- pkg/sentry/kernel/task_log.go | 45
-rw-r--r-- pkg/sentry/kernel/task_run.go | 19
-rw-r--r-- pkg/sentry/kernel/task_signals.go | 10
-rw-r--r-- pkg/sentry/kernel/task_start.go | 6
-rw-r--r-- pkg/sentry/kernel/task_stop.go | 16
-rw-r--r-- pkg/sentry/kernel/task_work.go | 38
-rw-r--r-- pkg/sentry/kernel/thread_group.go | 7
-rw-r--r-- pkg/sentry/kernel/threads.go | 7
-rw-r--r-- pkg/sentry/kernel/time/BUILD | 1
-rw-r--r-- pkg/sentry/kernel/time/tcpip.go | 131
-rw-r--r-- pkg/sentry/kernel/timekeeper.go | 9
-rw-r--r-- pkg/sentry/kernel/vdso.go | 6
-rw-r--r-- pkg/sentry/loader/BUILD | 4
-rw-r--r-- pkg/sentry/loader/elf.go | 19
-rw-r--r-- pkg/sentry/loader/loader.go | 27
-rw-r--r-- pkg/sentry/loader/vdso.go | 61
-rw-r--r-- pkg/sentry/memmap/BUILD | 14
-rw-r--r-- pkg/sentry/memmap/memmap.go | 69
-rw-r--r-- pkg/sentry/mm/BUILD | 4
-rw-r--r-- pkg/sentry/mm/aio_context.go | 9
-rw-r--r-- pkg/sentry/mm/lifecycle.go | 4
-rw-r--r-- pkg/sentry/mm/metadata.go | 5
-rw-r--r-- pkg/sentry/mm/mm.go | 10
-rw-r--r-- pkg/sentry/mm/pma.go | 25
-rw-r--r-- pkg/sentry/mm/special_mappable.go | 11
-rw-r--r-- pkg/sentry/mm/syscalls.go | 4
-rw-r--r-- pkg/sentry/mm/vma.go | 11
-rw-r--r-- pkg/sentry/pgalloc/BUILD | 29
-rw-r--r-- pkg/sentry/pgalloc/pgalloc.go | 280
-rw-r--r-- pkg/sentry/pgalloc/pgalloc_test.go | 206
-rw-r--r-- pkg/sentry/pgalloc/save_restore.go | 13
-rw-r--r-- pkg/sentry/platform/BUILD | 20
-rw-r--r-- pkg/sentry/platform/kvm/BUILD | 5
-rw-r--r-- pkg/sentry/platform/kvm/address_space.go | 82
-rw-r--r-- pkg/sentry/platform/kvm/bluepill_allocator.go (renamed from pkg/sentry/platform/kvm/allocator.go) | 52
-rw-r--r-- pkg/sentry/platform/kvm/bluepill_amd64.go | 12
-rw-r--r-- pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go | 32
-rw-r--r-- pkg/sentry/platform/kvm/bluepill_arm64.go | 19
-rw-r--r-- pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go | 36
-rw-r--r-- pkg/sentry/platform/kvm/bluepill_unsafe.go | 70
-rw-r--r-- pkg/sentry/platform/kvm/context.go | 10
-rw-r--r-- pkg/sentry/platform/kvm/kvm_amd64_test.go | 51
-rw-r--r-- pkg/sentry/platform/kvm/kvm_arm64.go | 15
-rw-r--r-- pkg/sentry/platform/kvm/kvm_arm64_unsafe.go | 4
-rw-r--r-- pkg/sentry/platform/kvm/kvm_const.go | 18
-rw-r--r-- pkg/sentry/platform/kvm/kvm_const_arm64.go | 19
-rw-r--r-- pkg/sentry/platform/kvm/kvm_test.go | 24
-rw-r--r-- pkg/sentry/platform/kvm/machine.go | 52
-rw-r--r-- pkg/sentry/platform/kvm/machine_amd64.go | 34
-rw-r--r-- pkg/sentry/platform/kvm/machine_amd64_unsafe.go | 2
-rw-r--r-- pkg/sentry/platform/kvm/machine_arm64.go | 80
-rw-r--r-- pkg/sentry/platform/kvm/machine_arm64_unsafe.go | 50
-rw-r--r-- pkg/sentry/platform/kvm/machine_unsafe.go | 4
-rw-r--r-- pkg/sentry/platform/kvm/testutil/testutil_arm64.go | 4
-rw-r--r-- pkg/sentry/platform/kvm/testutil/testutil_arm64.s | 12
-rw-r--r-- pkg/sentry/platform/platform.go | 100
-rw-r--r-- pkg/sentry/platform/ptrace/BUILD | 2
-rw-r--r-- pkg/sentry/platform/ptrace/ptrace.go | 10
-rw-r--r-- pkg/sentry/platform/ptrace/subprocess.go | 9
-rw-r--r-- pkg/sentry/platform/ptrace/subprocess_unsafe.go | 2
-rw-r--r-- pkg/sentry/platform/ring0/aarch64.go | 35
-rw-r--r-- pkg/sentry/platform/ring0/entry_arm64.s | 98
-rw-r--r-- pkg/sentry/platform/ring0/kernel.go | 24
-rw-r--r-- pkg/sentry/platform/ring0/kernel_amd64.go | 12
-rw-r--r-- pkg/sentry/platform/ring0/kernel_arm64.go | 12
-rw-r--r-- pkg/sentry/platform/ring0/lib_arm64.go | 6
-rw-r--r-- pkg/sentry/platform/ring0/lib_arm64.s | 86
-rw-r--r-- pkg/sentry/platform/ring0/pagetables/allocator.go | 11
-rw-r--r-- pkg/sentry/platform/ring0/pagetables/pagetables.go | 8
-rw-r--r-- pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go | 9
-rw-r--r-- pkg/sentry/socket/BUILD | 1
-rw-r--r-- pkg/sentry/socket/control/control.go | 8
-rw-r--r-- pkg/sentry/socket/control/control_vfs2.go | 8
-rw-r--r-- pkg/sentry/socket/hostinet/BUILD | 3
-rw-r--r-- pkg/sentry/socket/hostinet/socket.go | 25
-rw-r--r-- pkg/sentry/socket/hostinet/socket_vfs2.go | 21
-rw-r--r-- pkg/sentry/socket/hostinet/stack.go | 11
-rw-r--r-- pkg/sentry/socket/netfilter/netfilter.go | 417
-rw-r--r-- pkg/sentry/socket/netfilter/owner_matcher.go | 2
-rw-r--r-- pkg/sentry/socket/netfilter/targets.go | 249
-rw-r--r-- pkg/sentry/socket/netfilter/tcp_matcher.go | 38
-rw-r--r-- pkg/sentry/socket/netfilter/udp_matcher.go | 38
-rw-r--r-- pkg/sentry/socket/netlink/BUILD | 3
-rw-r--r-- pkg/sentry/socket/netlink/provider.go | 2
-rw-r--r-- pkg/sentry/socket/netlink/socket.go | 28
-rw-r--r-- pkg/sentry/socket/netlink/socket_vfs2.go | 22
-rw-r--r-- pkg/sentry/socket/netstack/BUILD | 4
-rw-r--r-- pkg/sentry/socket/netstack/netstack.go | 476
-rw-r--r-- pkg/sentry/socket/netstack/netstack_vfs2.go | 31
-rw-r--r-- pkg/sentry/socket/netstack/stack.go | 47
-rw-r--r-- pkg/sentry/socket/socket.go | 8
-rw-r--r-- pkg/sentry/socket/unix/BUILD | 2
-rw-r--r-- pkg/sentry/socket/unix/transport/connectioned.go | 15
-rw-r--r-- pkg/sentry/socket/unix/transport/connectionless.go | 12
-rw-r--r-- pkg/sentry/socket/unix/transport/queue.go | 13
-rw-r--r-- pkg/sentry/socket/unix/transport/unix.go | 48
-rw-r--r-- pkg/sentry/socket/unix/unix.go | 78
-rw-r--r-- pkg/sentry/socket/unix/unix_vfs2.go | 38
-rw-r--r-- pkg/sentry/state/state.go | 1
-rw-r--r-- pkg/sentry/strace/epoll.go | 10
-rw-r--r-- pkg/sentry/strace/socket.go | 1
-rw-r--r-- pkg/sentry/strace/strace.go | 12
-rw-r--r-- pkg/sentry/syscalls/epoll.go | 18
-rw-r--r-- pkg/sentry/syscalls/linux/BUILD | 2
-rw-r--r-- pkg/sentry/syscalls/linux/linux64.go | 4
-rw-r--r-- pkg/sentry/syscalls/linux/sys_aio.go | 173
-rw-r--r-- pkg/sentry/syscalls/linux/sys_eventfd.go | 2
-rw-r--r-- pkg/sentry/syscalls/linux/sys_file.go | 169
-rw-r--r-- pkg/sentry/syscalls/linux/sys_futex.go | 54
-rw-r--r-- pkg/sentry/syscalls/linux/sys_getdents.go | 2
-rw-r--r-- pkg/sentry/syscalls/linux/sys_inotify.go | 10
-rw-r--r-- pkg/sentry/syscalls/linux/sys_lseek.go | 2
-rw-r--r-- pkg/sentry/syscalls/linux/sys_mmap.go | 4
-rw-r--r-- pkg/sentry/syscalls/linux/sys_mount.go | 2
-rw-r--r-- pkg/sentry/syscalls/linux/sys_pipe.go | 6
-rw-r--r-- pkg/sentry/syscalls/linux/sys_poll.go | 10
-rw-r--r-- pkg/sentry/syscalls/linux/sys_prctl.go | 4
-rw-r--r-- pkg/sentry/syscalls/linux/sys_read.go | 12
-rw-r--r-- pkg/sentry/syscalls/linux/sys_shm.go | 10
-rw-r--r-- pkg/sentry/syscalls/linux/sys_signal.go | 4
-rw-r--r-- pkg/sentry/syscalls/linux/sys_socket.go | 63
-rw-r--r-- pkg/sentry/syscalls/linux/sys_splice.go | 18
-rw-r--r-- pkg/sentry/syscalls/linux/sys_stat.go | 8
-rw-r--r-- pkg/sentry/syscalls/linux/sys_sync.go | 8
-rw-r--r-- pkg/sentry/syscalls/linux/sys_thread.go | 6
-rw-r--r-- pkg/sentry/syscalls/linux/sys_timerfd.go | 6
-rw-r--r-- pkg/sentry/syscalls/linux/sys_write.go | 10
-rw-r--r-- pkg/sentry/syscalls/linux/sys_xattr.go | 20
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/BUILD | 11
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/aio.go | 219
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/epoll.go | 14
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/eventfd.go | 4
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/execve.go | 12
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/fd.go | 197
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/filesystem.go | 38
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/fscontext.go | 22
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/getdents.go | 2
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/inotify.go | 137
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/ioctl.go | 74
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/lock.go | 64
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/memfd.go | 3
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/mmap.go | 4
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/mount.go | 150
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/path.go | 12
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/pipe.go | 6
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/poll.go | 10
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/read_write.go | 91
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/setstat.go | 72
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/signal.go | 4
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/socket.go | 63
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/splice.go | 490
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/stat.go | 30
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/sync.go | 48
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/timerfd.go | 8
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/vfs2.go | 132
-rw-r--r-- pkg/sentry/syscalls/linux/vfs2/xattr.go | 16
-rw-r--r-- pkg/sentry/time/muldiv_arm64.s | 3
-rw-r--r-- pkg/sentry/time/parameters.go | 12
-rw-r--r-- pkg/sentry/time/parameters_test.go | 15
-rw-r--r-- pkg/sentry/vfs/BUILD | 16
-rw-r--r-- pkg/sentry/vfs/README.md | 6
-rw-r--r-- pkg/sentry/vfs/anonfs.go | 19
-rw-r--r-- pkg/sentry/vfs/dentry.go | 78
-rw-r--r-- pkg/sentry/vfs/epoll.go | 11
-rw-r--r-- pkg/sentry/vfs/file_description.go | 180
-rw-r--r-- pkg/sentry/vfs/file_description_impl_util.go | 86
-rw-r--r-- pkg/sentry/vfs/file_description_impl_util_test.go | 27
-rw-r--r-- pkg/sentry/vfs/filesystem.go | 8
-rw-r--r-- pkg/sentry/vfs/g3doc/inotify.md | 210
-rw-r--r-- pkg/sentry/vfs/genericfstree/genericfstree.go | 3
-rw-r--r-- pkg/sentry/vfs/inotify.go | 774
-rw-r--r-- pkg/sentry/vfs/lock.go (renamed from pkg/sentry/vfs/lock/lock.go) | 43
-rw-r--r-- pkg/sentry/vfs/lock/BUILD | 13
-rw-r--r-- pkg/sentry/vfs/mount.go | 150
-rw-r--r-- pkg/sentry/vfs/mount_unsafe.go | 2
-rw-r--r-- pkg/sentry/vfs/options.go | 21
-rw-r--r-- pkg/sentry/vfs/pathname.go | 18
-rw-r--r-- pkg/sentry/vfs/permissions.go | 53
-rw-r--r-- pkg/sentry/vfs/resolving_path.go | 43
-rw-r--r-- pkg/sentry/vfs/vfs.go | 171
-rw-r--r-- pkg/sentry/watchdog/watchdog.go | 14
414 files changed, 15783 insertions, 4465 deletions
diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go
index 343f81f59..0f433ee79 100644
--- a/pkg/sentry/arch/arch_aarch64.go
+++ b/pkg/sentry/arch/arch_aarch64.go
@@ -17,7 +17,6 @@
package arch
import (
- "encoding/binary"
"fmt"
"io"
@@ -29,7 +28,14 @@ import (
)
// Registers represents the CPU registers for this architecture.
-type Registers = linux.PtraceRegs
+//
+// +stateify savable
+type Registers struct {
+ linux.PtraceRegs
+
+ // TPIDR_EL0 is the EL0 Read/Write Software Thread ID Register.
+ TPIDR_EL0 uint64
+}
const (
// SyscallWidth is the width of instructions.
@@ -49,9 +55,14 @@ const ARMTrapFlag = uint64(1) << 21
type aarch64FPState []byte
// initAarch64FPState sets up initial state.
+//
+// Related code in Linux kernel: fpsimd_flush_thread().
+// FPCR = FPCR_RM_RN (0x0 << 22).
+//
+// Currently, aarch64FPState is only a 0x210-byte region for the fpstate.
+// The fpsimd context header is unused by sentry/ptrace/kvm.
+//
func initAarch64FPState(data aarch64FPState) {
- binary.LittleEndian.PutUint32(data, fpsimdMagic)
- binary.LittleEndian.PutUint32(data[4:], fpsimdContextSize)
}
func newAarch64FPStateSlice() []byte {
@@ -90,6 +101,8 @@ func NewFloatingPointData() *FloatingPointData {
// State contains the common architecture bits for aarch64 (the build tag of this
// file ensures it's only built on aarch64).
+//
+// +stateify savable
type State struct {
// The system registers.
Regs Registers
@@ -97,9 +110,6 @@ type State struct {
// Our floating point state.
aarch64FPState `state:"wait"`
- // TLS pointer
- TPValue uint64
-
// FeatureSet is a pointer to the currently active feature set.
FeatureSet *cpuid.FeatureSet
@@ -153,7 +163,6 @@ func (s *State) Fork() State {
return State{
Regs: s.Regs,
aarch64FPState: s.aarch64FPState.fork(),
- TPValue: s.TPValue,
FeatureSet: s.FeatureSet,
OrigR0: s.OrigR0,
}
@@ -237,18 +246,18 @@ func (s *State) ptraceGetRegs() Registers {
return s.Regs
}
-var registersSize = (*Registers)(nil).SizeBytes()
+var ptraceRegistersSize = (*linux.PtraceRegs)(nil).SizeBytes()
// PtraceSetRegs implements Context.PtraceSetRegs.
func (s *State) PtraceSetRegs(src io.Reader) (int, error) {
var regs Registers
- buf := make([]byte, registersSize)
+ buf := make([]byte, ptraceRegistersSize)
if _, err := io.ReadFull(src, buf); err != nil {
return 0, err
}
regs.UnmarshalUnsafe(buf)
s.Regs = regs
- return registersSize, nil
+ return ptraceRegistersSize, nil
}
// PtraceGetFPRegs implements Context.PtraceGetFPRegs.
@@ -274,7 +283,7 @@ const (
func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) {
switch regset {
case _NT_PRSTATUS:
- if maxlen < registersSize {
+ if maxlen < ptraceRegistersSize {
return 0, syserror.EFAULT
}
return s.PtraceGetRegs(dst)
@@ -287,7 +296,7 @@ func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int,
func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) {
switch regset {
case _NT_PRSTATUS:
- if maxlen < registersSize {
+ if maxlen < ptraceRegistersSize {
return 0, syserror.EFAULT
}
return s.PtraceSetRegs(src)
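For context, here is a minimal, self-contained sketch of the struct-embedding pattern the arch_aarch64.go change above relies on. The field names are illustrative, not gVisor's actual definitions; the point is that the ptrace wire format stays keyed to the embedded linux.PtraceRegs (hence the rename to ptraceRegistersSize), while extra state such as TPIDR_EL0 travels with the wrapper only for save/restore.

package main

import "fmt"

// PtraceRegs stands in for linux.PtraceRegs (fields illustrative).
type PtraceRegs struct {
	Regs [31]uint64
	Sp   uint64
	Pc   uint64
}

// Registers mirrors the new arm64 arch.Registers shape: the embedded
// struct defines what ptrace sees; TPIDR_EL0 rides alongside it.
type Registers struct {
	PtraceRegs
	TPIDR_EL0 uint64
}

func main() {
	var r Registers
	r.Pc = 0x1000        // promoted field from the embedded struct
	r.TPIDR_EL0 = 0x7f00 // arch-specific state, invisible to ptrace
	fmt.Printf("pc=%#x tls=%#x\n", r.Pc, r.TPIDR_EL0)
}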
diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go
index 3b3a0a272..1c3e3c14c 100644
--- a/pkg/sentry/arch/arch_amd64.go
+++ b/pkg/sentry/arch/arch_amd64.go
@@ -300,7 +300,7 @@ func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) {
// PTRACE_PEEKUSER and PTRACE_POKEUSER are only effective on regs and
// u_debugreg, returning 0 or silently no-oping for other fields
// respectively.
- if addr < uintptr(registersSize) {
+ if addr < uintptr(ptraceRegistersSize) {
regs := c.ptraceGetRegs()
buf := make([]byte, regs.SizeBytes())
regs.MarshalUnsafe(buf)
@@ -315,7 +315,7 @@ func (c *context64) PtracePokeUser(addr, data uintptr) error {
if addr&7 != 0 || addr >= userStructSize {
return syscall.EIO
}
- if addr < uintptr(registersSize) {
+ if addr < uintptr(ptraceRegistersSize) {
regs := c.ptraceGetRegs()
buf := make([]byte, regs.SizeBytes())
regs.MarshalUnsafe(buf)
diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go
index ada7ac7b8..550741d8c 100644
--- a/pkg/sentry/arch/arch_arm64.go
+++ b/pkg/sentry/arch/arch_arm64.go
@@ -73,6 +73,8 @@ const (
)
// context64 represents an ARM64 context.
+//
+// +stateify savable
type context64 struct {
State
sigFPState []aarch64FPState // fpstate to be restored on sigreturn.
@@ -142,7 +144,7 @@ func (c *context64) SetStack(value uintptr) {
// TLS returns the current TLS pointer.
func (c *context64) TLS() uintptr {
- return uintptr(c.TPValue)
+ return uintptr(c.Regs.TPIDR_EL0)
}
// SetTLS sets the current TLS pointer. Returns false if value is invalid.
@@ -151,7 +153,7 @@ func (c *context64) SetTLS(value uintptr) bool {
return false
}
- c.TPValue = uint64(value)
+ c.Regs.TPIDR_EL0 = uint64(value)
return true
}
diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go
index dc458b37f..b9405b320 100644
--- a/pkg/sentry/arch/arch_x86.go
+++ b/pkg/sentry/arch/arch_x86.go
@@ -31,7 +31,11 @@ import (
)
// Registers represents the CPU registers for this architecture.
-type Registers = linux.PtraceRegs
+//
+// +stateify savable
+type Registers struct {
+ linux.PtraceRegs
+}
// System-related constants for x86.
const (
@@ -311,12 +315,12 @@ func (s *State) ptraceGetRegs() Registers {
return regs
}
-var registersSize = (*Registers)(nil).SizeBytes()
+var ptraceRegistersSize = (*linux.PtraceRegs)(nil).SizeBytes()
// PtraceSetRegs implements Context.PtraceSetRegs.
func (s *State) PtraceSetRegs(src io.Reader) (int, error) {
var regs Registers
- buf := make([]byte, registersSize)
+ buf := make([]byte, ptraceRegistersSize)
if _, err := io.ReadFull(src, buf); err != nil {
return 0, err
}
@@ -374,7 +378,7 @@ func (s *State) PtraceSetRegs(src io.Reader) (int, error) {
}
regs.Eflags = (s.Regs.Eflags &^ eflagsPtraceMutable) | (regs.Eflags & eflagsPtraceMutable)
s.Regs = regs
- return registersSize, nil
+ return ptraceRegistersSize, nil
}
// isUserSegmentSelector returns true if the given segment selector specifies a
@@ -543,7 +547,7 @@ const (
func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) {
switch regset {
case _NT_PRSTATUS:
- if maxlen < registersSize {
+ if maxlen < ptraceRegistersSize {
return 0, syserror.EFAULT
}
return s.PtraceGetRegs(dst)
@@ -563,7 +567,7 @@ func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int,
func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) {
switch regset {
case _NT_PRSTATUS:
- if maxlen < registersSize {
+ if maxlen < ptraceRegistersSize {
return 0, syserror.EFAULT
}
return s.PtraceSetRegs(src)
diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD
index e74275d2d..2c5d14be5 100644
--- a/pkg/sentry/control/BUILD
+++ b/pkg/sentry/control/BUILD
@@ -16,14 +16,12 @@ go_library(
],
deps = [
"//pkg/abi/linux",
- "//pkg/context",
"//pkg/fd",
- "//pkg/fspath",
"//pkg/log",
"//pkg/sentry/fdimport",
"//pkg/sentry/fs",
"//pkg/sentry/fs/host",
- "//pkg/sentry/fsbridge",
+ "//pkg/sentry/fs/user",
"//pkg/sentry/fsimpl/host",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
@@ -35,7 +33,6 @@ go_library(
"//pkg/sentry/vfs",
"//pkg/sentry/watchdog",
"//pkg/sync",
- "//pkg/syserror",
"//pkg/tcpip/link/sniffer",
"//pkg/urpc",
"@org_golang_x_sys//unix:go_default_library",
diff --git a/pkg/sentry/control/logging.go b/pkg/sentry/control/logging.go
index 811f24324..8a500a515 100644
--- a/pkg/sentry/control/logging.go
+++ b/pkg/sentry/control/logging.go
@@ -70,8 +70,8 @@ type LoggingArgs struct {
type Logging struct{}
// Change will change the log level and strace arguments. Although
-// this functions signature requires an error it never acctually
-// return san error. It's required by the URPC interface.
+// this functions signature requires an error it never actually
+// returns an error. It's required by the URPC interface.
// Additionally, it may look odd that this is the only method
// attached to an empty struct but this is also part of how
// URPC dispatches.
diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go
index 663e51989..2bf3c45e1 100644
--- a/pkg/sentry/control/pprof.go
+++ b/pkg/sentry/control/pprof.go
@@ -49,6 +49,9 @@ type ProfileOpts struct {
// - dump out the stack trace of current go routines.
// sentryctl -pid <pid> pprof-goroutine
type Profile struct {
+ // Kernel is the kernel under profile. It's immutable.
+ Kernel *kernel.Kernel
+
// mu protects the fields below.
mu sync.Mutex
@@ -57,9 +60,6 @@ type Profile struct {
// traceFile is the current execution trace output file.
traceFile *fd.FD
-
- // Kernel is the kernel under profile.
- Kernel *kernel.Kernel
}
// StartCPUProfile is an RPC stub which starts recording the CPU profile in a
diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index 2ed17ee09..dfa936563 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -18,7 +18,6 @@ import (
"bytes"
"encoding/json"
"fmt"
- "path"
"sort"
"strings"
"text/tabwriter"
@@ -26,13 +25,10 @@ import (
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/fspath"
- "gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/fdimport"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/host"
- "gvisor.dev/gvisor/pkg/sentry/fsbridge"
+ "gvisor.dev/gvisor/pkg/sentry/fs/user"
hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -40,7 +36,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/urpc"
)
@@ -108,6 +103,9 @@ type ExecArgs struct {
// String prints the arguments as a string.
func (args ExecArgs) String() string {
+ if len(args.Argv) == 0 {
+ return args.Filename
+ }
a := make([]string, len(args.Argv))
copy(a, args.Argv)
if args.Filename != "" {
@@ -141,7 +139,6 @@ func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID
func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
// Import file descriptors.
fdTable := proc.Kernel.NewFDTable()
- defer fdTable.DecRef()
creds := auth.NewUserCredentials(
args.KUID,
@@ -179,43 +176,32 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
initArgs.MountNamespaceVFS2.IncRef()
}
ctx := initArgs.NewContext(proc.Kernel)
+ defer fdTable.DecRef(ctx)
+
+ if kernel.VFS2Enabled {
+ // Get the full path to the filename from the PATH env variable.
+ if initArgs.MountNamespaceVFS2 == nil {
+ // Set initArgs so that 'ctx' returns the namespace.
+ //
+ // MountNamespaceVFS2 adds a reference to the namespace, which is
+ // transferred to the new process.
+ initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2()
+ }
+ } else {
+ if initArgs.MountNamespace == nil {
+ // Set initArgs so that 'ctx' returns the namespace.
+ initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace()
- if initArgs.Filename == "" {
- if kernel.VFS2Enabled {
- // Get the full path to the filename from the PATH env variable.
- if initArgs.MountNamespaceVFS2 == nil {
- // Set initArgs so that 'ctx' returns the namespace.
- //
- // MountNamespaceVFS2 adds a reference to the namespace, which is
- // transferred to the new process.
- initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2()
- }
-
- paths := fs.GetPath(initArgs.Envv)
- vfsObj := proc.Kernel.VFS()
- file, err := ResolveExecutablePath(ctx, vfsObj, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
- if err != nil {
- return nil, 0, nil, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err)
- }
- initArgs.File = fsbridge.NewVFSFile(file)
- } else {
- // Get the full path to the filename from the PATH env variable.
- paths := fs.GetPath(initArgs.Envv)
- if initArgs.MountNamespace == nil {
- // Set initArgs so that 'ctx' returns the namespace.
- initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace()
-
- // initArgs must hold a reference on MountNamespace, which will
- // be donated to the new process in CreateProcess.
- initArgs.MountNamespace.IncRef()
- }
- f, err := initArgs.MountNamespace.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
- if err != nil {
- return nil, 0, nil, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err)
- }
- initArgs.Filename = f
+ // initArgs must hold a reference on MountNamespace, which will
+ // be donated to the new process in CreateProcess.
+ initArgs.MountNamespace.IncRef()
}
}
+ resolved, err := user.ResolveExecutablePath(ctx, &initArgs)
+ if err != nil {
+ return nil, 0, nil, nil, err
+ }
+ initArgs.Filename = resolved
fds := make([]int, len(args.FilePayload.Files))
for i, file := range args.FilePayload.Files {
@@ -428,67 +414,3 @@ func ttyName(tty *kernel.TTY) string {
}
return fmt.Sprintf("pts/%d", tty.Index)
}
-
-// ResolveExecutablePath resolves the given executable name given a set of
-// paths that might contain it.
-func ResolveExecutablePath(ctx context.Context, vfsObj *vfs.VirtualFilesystem, wd, name string, paths []string) (*vfs.FileDescription, error) {
- root := vfs.RootFromContext(ctx)
- defer root.DecRef()
- creds := auth.CredentialsFromContext(ctx)
-
- // Absolute paths can be used directly.
- if path.IsAbs(name) {
- return openExecutable(ctx, vfsObj, creds, root, name)
- }
-
- // Paths with '/' in them should be joined to the working directory, or
- // to the root if working directory is not set.
- if strings.IndexByte(name, '/') > 0 {
- if len(wd) == 0 {
- wd = "/"
- }
- if !path.IsAbs(wd) {
- return nil, fmt.Errorf("working directory %q must be absolute", wd)
- }
- return openExecutable(ctx, vfsObj, creds, root, path.Join(wd, name))
- }
-
- // Otherwise, we must lookup the name in the paths, starting from the
- // calling context's root directory.
- for _, p := range paths {
- if !path.IsAbs(p) {
- // Relative paths aren't safe, no one should be using them.
- log.Warningf("Skipping relative path %q in $PATH", p)
- continue
- }
-
- binPath := path.Join(p, name)
- f, err := openExecutable(ctx, vfsObj, creds, root, binPath)
- if err != nil {
- return nil, err
- }
- if f == nil {
- continue // Not found/no access.
- }
- return f, nil
- }
- return nil, syserror.ENOENT
-}
-
-func openExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry, path string) (*vfs.FileDescription, error) {
- pop := vfs.PathOperation{
- Root: root,
- Start: root, // binPath is absolute, Start can be anything.
- Path: fspath.Parse(path),
- FollowFinalSymlink: true,
- }
- opts := &vfs.OpenOptions{
- Flags: linux.O_RDONLY,
- FileExec: true,
- }
- f, err := vfsObj.OpenAt(ctx, creds, &pop, opts)
- if err == syserror.ENOENT || err == syserror.EACCES {
- return nil, nil
- }
- return f, err
-}
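For context, a simplified sketch of the $PATH resolution that the diff above consolidates into pkg/sentry/fs/user. The helper name and the os.Stat existence check are illustrative only; the real user.ResolveExecutablePath resolves through the sandbox VFS, checks execute permission, and returns ENOENT.

package main

import (
	"errors"
	"fmt"
	"os"
	"path"
	"strings"
)

// resolve applies the three cases the removed code handled: absolute
// names pass through, names containing '/' are joined to the working
// directory, and bare names are searched against absolute $PATH entries.
func resolve(wd, name string, paths []string) (string, error) {
	if path.IsAbs(name) {
		return name, nil
	}
	if strings.Contains(name, "/") {
		if wd == "" {
			wd = "/"
		}
		return path.Join(wd, name), nil
	}
	for _, p := range paths {
		if !path.IsAbs(p) {
			continue // relative $PATH entries are unsafe; skip them
		}
		candidate := path.Join(p, name)
		if _, err := os.Stat(candidate); err == nil {
			return candidate, nil
		}
	}
	return "", errors.New("executable not found in $PATH")
}

func main() {
	p, err := resolve("/root", "ls", []string{"/bin", "/usr/bin"})
	fmt.Println(p, err)
}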
diff --git a/pkg/sentry/device/device.go b/pkg/sentry/device/device.go
index 69e71e322..f45b2bd2b 100644
--- a/pkg/sentry/device/device.go
+++ b/pkg/sentry/device/device.go
@@ -188,6 +188,9 @@ type MultiDevice struct {
// String stringifies MultiDevice.
func (m *MultiDevice) String() string {
+ m.mu.Lock()
+ defer m.mu.Unlock()
+
buf := bytes.NewBuffer(nil)
buf.WriteString("cache{")
for k, v := range m.cache {
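For context, a minimal sketch of why the MultiDevice.String() fix above matters: ranging over a Go map while another goroutine writes to it is a data race the runtime may abort on, so the stringifier must take the same mutex as the mutators. Type and field names here are illustrative.

package main

import (
	"fmt"
	"sync"
)

type deviceCache struct {
	mu    sync.Mutex
	cache map[int]string
}

func (d *deviceCache) put(k int, v string) {
	d.mu.Lock()
	defer d.mu.Unlock()
	d.cache[k] = v
}

// String locks d.mu; without it, concurrent put() calls would race
// with the range loop below.
func (d *deviceCache) String() string {
	d.mu.Lock()
	defer d.mu.Unlock()
	s := "cache{"
	for k, v := range d.cache {
		s += fmt.Sprintf("%d:%s ", k, v)
	}
	return s + "}"
}

func main() {
	d := &deviceCache{cache: map[int]string{}}
	d.put(1, "dev")
	fmt.Println(d) // fmt calls String(), which takes the lock
}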
diff --git a/pkg/sentry/devices/memdev/full.go b/pkg/sentry/devices/memdev/full.go
index c7e197691..511179e31 100644
--- a/pkg/sentry/devices/memdev/full.go
+++ b/pkg/sentry/devices/memdev/full.go
@@ -42,10 +42,11 @@ type fullFD struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
+ vfs.NoLockFD
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *fullFD) Release() {
+func (fd *fullFD) Release(context.Context) {
// noop
}
diff --git a/pkg/sentry/devices/memdev/null.go b/pkg/sentry/devices/memdev/null.go
index 33d060d02..4918dbeeb 100644
--- a/pkg/sentry/devices/memdev/null.go
+++ b/pkg/sentry/devices/memdev/null.go
@@ -43,10 +43,11 @@ type nullFD struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
+ vfs.NoLockFD
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *nullFD) Release() {
+func (fd *nullFD) Release(context.Context) {
// noop
}
diff --git a/pkg/sentry/devices/memdev/random.go b/pkg/sentry/devices/memdev/random.go
index acfa23149..5e7fe0280 100644
--- a/pkg/sentry/devices/memdev/random.go
+++ b/pkg/sentry/devices/memdev/random.go
@@ -48,6 +48,7 @@ type randomFD struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
+ vfs.NoLockFD
// off is the "file offset". off is accessed using atomic memory
// operations.
@@ -55,7 +56,7 @@ type randomFD struct {
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *randomFD) Release() {
+func (fd *randomFD) Release(context.Context) {
// noop
}
diff --git a/pkg/sentry/devices/memdev/zero.go b/pkg/sentry/devices/memdev/zero.go
index 3b1372b9e..2e631a252 100644
--- a/pkg/sentry/devices/memdev/zero.go
+++ b/pkg/sentry/devices/memdev/zero.go
@@ -44,10 +44,11 @@ type zeroFD struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
+ vfs.NoLockFD
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *zeroFD) Release() {
+func (fd *zeroFD) Release(context.Context) {
// noop
}
diff --git a/pkg/sentry/devices/ttydev/BUILD b/pkg/sentry/devices/ttydev/BUILD
new file mode 100644
index 000000000..12e49b58a
--- /dev/null
+++ b/pkg/sentry/devices/ttydev/BUILD
@@ -0,0 +1,16 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+ name = "ttydev",
+ srcs = ["ttydev.go"],
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/context",
+ "//pkg/sentry/fsimpl/devtmpfs",
+ "//pkg/sentry/vfs",
+ "//pkg/usermem",
+ ],
+)
diff --git a/pkg/sentry/devices/ttydev/ttydev.go b/pkg/sentry/devices/ttydev/ttydev.go
new file mode 100644
index 000000000..fd4b79c46
--- /dev/null
+++ b/pkg/sentry/devices/ttydev/ttydev.go
@@ -0,0 +1,91 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package ttydev implements devices for /dev/tty and (eventually)
+// /dev/console.
+//
+// TODO(b/159623826): Support /dev/console.
+package ttydev
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+const (
+ // See drivers/tty/tty_io.c:tty_init().
+ ttyDevMinor = 0
+ consoleDevMinor = 1
+)
+
+// ttyDevice implements vfs.Device for /dev/tty.
+type ttyDevice struct{}
+
+// Open implements vfs.Device.Open.
+func (ttyDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd := &ttyFD{}
+ if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{
+ UseDentryMetadata: true,
+ }); err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
+
+// ttyFD implements vfs.FileDescriptionImpl for /dev/tty.
+type ttyFD struct {
+ vfsfd vfs.FileDescription
+ vfs.FileDescriptionDefaultImpl
+ vfs.DentryMetadataFileDescriptionImpl
+ vfs.NoLockFD
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *ttyFD) Release(context.Context) {}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *ttyFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ return 0, nil
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *ttyFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ return 0, nil
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *ttyFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ return src.NumBytes(), nil
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *ttyFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ return src.NumBytes(), nil
+}
+
+// Register registers all devices implemented by this package in vfsObj.
+func Register(vfsObj *vfs.VirtualFilesystem) error {
+ return vfsObj.RegisterDevice(vfs.CharDevice, linux.TTYAUX_MAJOR, ttyDevMinor, ttyDevice{}, &vfs.RegisterDeviceOptions{
+ GroupName: "tty",
+ })
+}
+
+// CreateDevtmpfsFiles creates device special files in dev representing all
+// devices implemented by this package.
+func CreateDevtmpfsFiles(ctx context.Context, dev *devtmpfs.Accessor) error {
+ return dev.CreateDeviceFile(ctx, "tty", vfs.CharDevice, linux.TTYAUX_MAJOR, ttyDevMinor, 0666 /* mode */)
+}
diff --git a/pkg/sentry/devices/tundev/BUILD b/pkg/sentry/devices/tundev/BUILD
new file mode 100644
index 000000000..71c59287c
--- /dev/null
+++ b/pkg/sentry/devices/tundev/BUILD
@@ -0,0 +1,23 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+ name = "tundev",
+ srcs = ["tundev.go"],
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/context",
+ "//pkg/sentry/arch",
+ "//pkg/sentry/fsimpl/devtmpfs",
+ "//pkg/sentry/inet",
+ "//pkg/sentry/kernel",
+ "//pkg/sentry/socket/netstack",
+ "//pkg/sentry/vfs",
+ "//pkg/syserror",
+ "//pkg/tcpip/link/tun",
+ "//pkg/usermem",
+ "//pkg/waiter",
+ ],
+)
diff --git a/pkg/sentry/devices/tundev/tundev.go b/pkg/sentry/devices/tundev/tundev.go
new file mode 100644
index 000000000..a40625e19
--- /dev/null
+++ b/pkg/sentry/devices/tundev/tundev.go
@@ -0,0 +1,178 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package tundev implements the /dev/net/tun device.
+package tundev
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/inet"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/socket/netstack"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/tcpip/link/tun"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+ netTunDevMajor = 10
+ netTunDevMinor = 200
+)
+
+// tunDevice implements vfs.Device for /dev/net/tun.
+type tunDevice struct{}
+
+// Open implements vfs.Device.Open.
+func (tunDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd := &tunFD{}
+ if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{
+ UseDentryMetadata: true,
+ }); err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
+
+// tunFD implements vfs.FileDescriptionImpl for /dev/net/tun.
+type tunFD struct {
+ vfsfd vfs.FileDescription
+ vfs.FileDescriptionDefaultImpl
+ vfs.DentryMetadataFileDescriptionImpl
+ vfs.NoLockFD
+
+ device tun.Device
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (fd *tunFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ request := args[1].Uint()
+ data := args[2].Pointer()
+
+ switch request {
+ case linux.TUNSETIFF:
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ panic("Ioctl should be called from a task context")
+ }
+ if !t.HasCapability(linux.CAP_NET_ADMIN) {
+ return 0, syserror.EPERM
+ }
+ stack, ok := t.NetworkContext().(*netstack.Stack)
+ if !ok {
+ return 0, syserror.EINVAL
+ }
+
+ var req linux.IFReq
+ if _, err := usermem.CopyObjectIn(ctx, uio, data, &req, usermem.IOOpts{
+ AddressSpaceActive: true,
+ }); err != nil {
+ return 0, err
+ }
+ flags := usermem.ByteOrder.Uint16(req.Data[:])
+ return 0, fd.device.SetIff(stack.Stack, req.Name(), flags)
+
+ case linux.TUNGETIFF:
+ var req linux.IFReq
+
+ copy(req.IFName[:], fd.device.Name())
+
+ // Linux adds IFF_NOFILTER (the same value as IFF_NO_PI unfortunately) when
+ // there is no sk_filter. See __tun_chr_ioctl() in net/drivers/tun.c.
+ flags := fd.device.Flags() | linux.IFF_NOFILTER
+ usermem.ByteOrder.PutUint16(req.Data[:], flags)
+
+ _, err := usermem.CopyObjectOut(ctx, uio, data, &req, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ return 0, err
+
+ default:
+ return 0, syserror.ENOTTY
+ }
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *tunFD) Release(ctx context.Context) {
+ fd.device.Release(ctx)
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *tunFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ return fd.Read(ctx, dst, opts)
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *tunFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ data, err := fd.device.Read()
+ if err != nil {
+ return 0, err
+ }
+ n, err := dst.CopyOut(ctx, data)
+ if n > 0 && n < len(data) {
+ // Not an error for partial copying. Packet truncated.
+ err = nil
+ }
+ return int64(n), err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *tunFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ return fd.Write(ctx, src, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *tunFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ data := make([]byte, src.NumBytes())
+ if _, err := src.CopyIn(ctx, data); err != nil {
+ return 0, err
+ }
+ return fd.device.Write(data)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (fd *tunFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return fd.device.Readiness(mask)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (fd *tunFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ fd.device.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (fd *tunFD) EventUnregister(e *waiter.Entry) {
+ fd.device.EventUnregister(e)
+}
+
+// IsNetTunSupported returns whether /dev/net/tun device is supported for s.
+func IsNetTunSupported(s inet.Stack) bool {
+ _, ok := s.(*netstack.Stack)
+ return ok
+}
+
+// Register registers all devices implemented by this package in vfsObj.
+func Register(vfsObj *vfs.VirtualFilesystem) error {
+ return vfsObj.RegisterDevice(vfs.CharDevice, netTunDevMajor, netTunDevMinor, tunDevice{}, &vfs.RegisterDeviceOptions{})
+}
+
+// CreateDevtmpfsFiles creates device special files in dev representing all
+// devices implemented by this package.
+func CreateDevtmpfsFiles(ctx context.Context, dev *devtmpfs.Accessor) error {
+ return dev.CreateDeviceFile(ctx, "net/tun", vfs.CharDevice, netTunDevMajor, netTunDevMinor, 0666 /* mode */)
+}
diff --git a/pkg/sentry/fdimport/fdimport.go b/pkg/sentry/fdimport/fdimport.go
index a4199f9e9..1b7cb94c0 100644
--- a/pkg/sentry/fdimport/fdimport.go
+++ b/pkg/sentry/fdimport/fdimport.go
@@ -15,6 +15,8 @@
package fdimport
import (
+ "fmt"
+
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/host"
@@ -48,7 +50,7 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []
if err != nil {
return nil, err
}
- defer appFile.DecRef()
+ defer appFile.DecRef(ctx)
// Remember this in the TTY file, as we will
// use it for the other stdio FDs.
@@ -67,7 +69,7 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []
if err != nil {
return nil, err
}
- defer appFile.DecRef()
+ defer appFile.DecRef(ctx)
}
// Add the file to the FD map.
@@ -84,6 +86,9 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []
func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdioFDs []int) (*hostvfs2.TTYFileDescription, error) {
k := kernel.KernelFromContext(ctx)
+ if k == nil {
+ return nil, fmt.Errorf("cannot find kernel from context")
+ }
var ttyFile *vfs.FileDescription
for appFD, hostFD := range stdioFDs {
@@ -97,7 +102,7 @@ func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdi
if err != nil {
return nil, err
}
- defer appFile.DecRef()
+ defer appFile.DecRef(ctx)
// Remember this in the TTY file, as we will use it for the other stdio
// FDs.
@@ -114,7 +119,7 @@ func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdi
if err != nil {
return nil, err
}
- defer appFile.DecRef()
+ defer appFile.DecRef(ctx)
}
if err := fdTable.NewFDAtVFS2(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil {
diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go
index ab1424c95..735452b07 100644
--- a/pkg/sentry/fs/copy_up.go
+++ b/pkg/sentry/fs/copy_up.go
@@ -201,7 +201,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
parentUpper := parent.Inode.overlay.upper
root := RootFromContext(ctx)
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(ctx)
}
// Create the file in the upper filesystem and get an Inode for it.
@@ -212,7 +212,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
log.Warningf("copy up failed to create file: %v", err)
return syserror.EIO
}
- defer childFile.DecRef()
+ defer childFile.DecRef(ctx)
childUpperInode = childFile.Dirent.Inode
case Directory:
@@ -226,7 +226,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
cleanupUpper(ctx, parentUpper, next.name, werr)
return syserror.EIO
}
- defer childUpper.DecRef()
+ defer childUpper.DecRef(ctx)
childUpperInode = childUpper.Inode
case Symlink:
@@ -246,7 +246,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
cleanupUpper(ctx, parentUpper, next.name, werr)
return syserror.EIO
}
- defer childUpper.DecRef()
+ defer childUpper.DecRef(ctx)
childUpperInode = childUpper.Inode
default:
@@ -352,14 +352,14 @@ func copyContentsLocked(ctx context.Context, upper *Inode, lower *Inode, size in
if err != nil {
return err
}
- defer upperFile.DecRef()
+ defer upperFile.DecRef(ctx)
// Get a handle to the lower filesystem, which we will read from.
lowerFile, err := overlayFile(ctx, lower, FileFlags{Read: true})
if err != nil {
return err
}
- defer lowerFile.DecRef()
+ defer lowerFile.DecRef(ctx)
// Use a buffer pool to minimize allocations.
buf := copyUpBuffers.Get().([]byte)
diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go
index 91792d9fe..c7a11eec1 100644
--- a/pkg/sentry/fs/copy_up_test.go
+++ b/pkg/sentry/fs/copy_up_test.go
@@ -126,7 +126,7 @@ func makeOverlayTestFiles(t *testing.T) []*overlayTestFile {
if err != nil {
t.Fatalf("failed to create file %q: %v", name, err)
}
- defer f.DecRef()
+ defer f.DecRef(ctx)
relname, _ := f.Dirent.FullName(lowerRoot)
@@ -171,7 +171,7 @@ func makeOverlayTestFiles(t *testing.T) []*overlayTestFile {
if err != nil {
t.Fatalf("failed to find %q: %v", f.name, err)
}
- defer d.DecRef()
+ defer d.DecRef(ctx)
f.File, err = d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true})
if err != nil {
diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go
index dc7ad075a..ec474e554 100644
--- a/pkg/sentry/fs/dev/net_tun.go
+++ b/pkg/sentry/fs/dev/net_tun.go
@@ -80,8 +80,8 @@ type netTunFileOperations struct {
var _ fs.FileOperations = (*netTunFileOperations)(nil)
// Release implements fs.FileOperations.Release.
-func (fops *netTunFileOperations) Release() {
- fops.device.Release()
+func (fops *netTunFileOperations) Release(ctx context.Context) {
+ fops.device.Release(ctx)
}
// Ioctl implements fs.FileOperations.Ioctl.
diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go
index 65be12175..a2f751068 100644
--- a/pkg/sentry/fs/dirent.go
+++ b/pkg/sentry/fs/dirent.go
@@ -325,7 +325,7 @@ func (d *Dirent) SyncAll(ctx context.Context) {
for _, w := range d.children {
if child := w.Get(); child != nil {
child.(*Dirent).SyncAll(ctx)
- child.DecRef()
+ child.DecRef(ctx)
}
}
}
@@ -451,7 +451,7 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl
// which don't hold a hard reference on their parent (their parent holds a
// hard reference on them, and they contain virtually no state). But this is
// good house-keeping.
- child.DecRef()
+ child.DecRef(ctx)
return nil, syscall.ENOENT
}
@@ -468,20 +468,20 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl
// their pins on the child. Inotify doesn't properly support filesystems that
// revalidate dirents (since watches are lost on revalidation), but if we fail
// to unpin the watches child will never be GCed.
- cd.Inode.Watches.Unpin(cd)
+ cd.Inode.Watches.Unpin(ctx, cd)
// This child needs to be revalidated, fallthrough to unhash it. Make sure
// to not leak a reference from Get().
//
// Note that previous lookups may still have a reference to this stale child;
// this can't be helped, but we can ensure that *new* lookups are up-to-date.
- child.DecRef()
+ child.DecRef(ctx)
}
// Either our weak reference expired or we need to revalidate it. Unhash child first, we're
// about to replace it.
delete(d.children, name)
- w.Drop()
+ w.Drop(ctx)
}
// Slow path: load the InodeOperations into memory. Since this is a hot path and the lookup may be
@@ -512,12 +512,12 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl
// There are active references to the existing child, prefer it to the one we
// retrieved from Lookup. Likely the Lookup happened very close to the insertion
// of child, so considering one stale over the other is fairly arbitrary.
- c.DecRef()
+ c.DecRef(ctx)
// The child that was installed could be negative.
if cd.IsNegative() {
// If so, don't leak a reference and short circuit.
- child.DecRef()
+ child.DecRef(ctx)
return nil, syscall.ENOENT
}
@@ -531,7 +531,7 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl
// we did the Inode.Lookup. Fully drop the weak reference and fallback to using the child
// we looked up.
delete(d.children, name)
- w.Drop()
+ w.Drop(ctx)
}
// Give the looked up child a parent. We cannot kick out entries, since we just checked above
@@ -587,7 +587,7 @@ func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool {
return false
}
// Child exists.
- child.DecRef()
+ child.DecRef(ctx)
return true
}
@@ -622,7 +622,7 @@ func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags Fi
}
child := file.Dirent
- d.finishCreate(child, name)
+ d.finishCreate(ctx, child, name)
// Return the reference and the new file. When the last reference to
// the file is dropped, file.Dirent may no longer be cached.
@@ -631,7 +631,7 @@ func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags Fi
// finishCreate validates the created file, adds it as a child of this dirent,
// and notifies any watchers.
-func (d *Dirent) finishCreate(child *Dirent, name string) {
+func (d *Dirent) finishCreate(ctx context.Context, child *Dirent, name string) {
// Sanity check c, its name must be consistent.
if child.name != name {
panic(fmt.Sprintf("create from %q to %q returned unexpected name %q", d.name, name, child.name))
@@ -650,14 +650,14 @@ func (d *Dirent) finishCreate(child *Dirent, name string) {
panic(fmt.Sprintf("hashed child %q over a positive child", child.name))
}
// Don't leak a reference.
- old.DecRef()
+ old.DecRef(ctx)
// Drop d's reference.
- old.DecRef()
+ old.DecRef(ctx)
}
// Finally drop the useless weak reference on the floor.
- w.Drop()
+ w.Drop(ctx)
}
d.Inode.Watches.Notify(name, linux.IN_CREATE, 0)
@@ -686,17 +686,17 @@ func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, c
panic(fmt.Sprintf("hashed over a positive child %q", old.(*Dirent).name))
}
// Don't leak a reference.
- old.DecRef()
+ old.DecRef(ctx)
// Drop d's reference.
- old.DecRef()
+ old.DecRef(ctx)
}
// Unhash the negative Dirent, name needs to exist now.
delete(d.children, name)
// Finally drop the useless weak reference on the floor.
- w.Drop()
+ w.Drop(ctx)
}
// Execute the create operation.
@@ -756,7 +756,7 @@ func (d *Dirent) Bind(ctx context.Context, root *Dirent, name string, data trans
if e != nil {
return e
}
- d.finishCreate(childDir, name)
+ d.finishCreate(ctx, childDir, name)
return nil
})
if err == syscall.EEXIST {
@@ -901,7 +901,7 @@ func direntReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent,
// references to children.
//
// Preconditions: d.mu must be held.
-func (d *Dirent) flush() {
+func (d *Dirent) flush(ctx context.Context) {
expired := make(map[string]*refs.WeakRef)
for n, w := range d.children {
// Call flush recursively on each child before removing our
@@ -912,7 +912,7 @@ func (d *Dirent) flush() {
if !cd.IsNegative() {
// Flush the child.
cd.mu.Lock()
- cd.flush()
+ cd.flush(ctx)
cd.mu.Unlock()
// Allow the file system to drop extra references on child.
@@ -920,13 +920,13 @@ func (d *Dirent) flush() {
}
// Don't leak a reference.
- child.DecRef()
+ child.DecRef(ctx)
}
// Check if the child dirent is closed, and mark it as expired if it is.
// We must call w.Get() again here, since the child could have been closed
// by the calls to flush() and cache.Remove() in the above if-block.
if child := w.Get(); child != nil {
- child.DecRef()
+ child.DecRef(ctx)
} else {
expired[n] = w
}
@@ -935,7 +935,7 @@ func (d *Dirent) flush() {
// Remove expired entries.
for n, w := range expired {
delete(d.children, n)
- w.Drop()
+ w.Drop(ctx)
}
}
@@ -977,7 +977,7 @@ func (d *Dirent) mount(ctx context.Context, inode *Inode) (newChild *Dirent, err
if !ok {
panic("mount must mount over an existing dirent")
}
- weakRef.Drop()
+ weakRef.Drop(ctx)
// Note that even though `d` is now hidden, it still holds a reference
// to its parent.
@@ -1002,13 +1002,13 @@ func (d *Dirent) unmount(ctx context.Context, replacement *Dirent) error {
if !ok {
panic("mount must mount over an existing dirent")
}
- weakRef.Drop()
+ weakRef.Drop(ctx)
// d is not reachable anymore, and hence not mounted anymore.
d.mounted = false
// Drop mount reference.
- d.DecRef()
+ d.DecRef(ctx)
return nil
}
@@ -1029,7 +1029,7 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath
// Child does not exist.
return err
}
- defer child.DecRef()
+ defer child.DecRef(ctx)
// Remove cannot remove directories.
if IsDir(child.Inode.StableAttr) {
@@ -1055,7 +1055,7 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath
atomic.StoreInt32(&child.deleted, 1)
if w, ok := d.children[name]; ok {
delete(d.children, name)
- w.Drop()
+ w.Drop(ctx)
}
// Allow the file system to drop extra references on child.
@@ -1067,7 +1067,7 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath
// inode may have other links. If this was the last link, the events for the
// watch removal will be queued by the inode destructor.
child.Inode.Watches.MarkUnlinked()
- child.Inode.Watches.Unpin(child)
+ child.Inode.Watches.Unpin(ctx, child)
d.Inode.Watches.Notify(name, linux.IN_DELETE, 0)
return nil
@@ -1100,7 +1100,7 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string)
// Child does not exist.
return err
}
- defer child.DecRef()
+ defer child.DecRef(ctx)
// RemoveDirectory can only remove directories.
if !IsDir(child.Inode.StableAttr) {
@@ -1121,7 +1121,7 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string)
atomic.StoreInt32(&child.deleted, 1)
if w, ok := d.children[name]; ok {
delete(d.children, name)
- w.Drop()
+ w.Drop(ctx)
}
// Allow the file system to drop extra references on child.
@@ -1130,14 +1130,14 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string)
// Finally, let inotify know the child is being unlinked. Drop any extra
// refs from inotify to this child dirent.
child.Inode.Watches.MarkUnlinked()
- child.Inode.Watches.Unpin(child)
+ child.Inode.Watches.Unpin(ctx, child)
d.Inode.Watches.Notify(name, linux.IN_ISDIR|linux.IN_DELETE, 0)
return nil
}
// destroy closes this node and all children.
-func (d *Dirent) destroy() {
+func (d *Dirent) destroy(ctx context.Context) {
if d.IsNegative() {
// Nothing to tear down and no parent references to drop, since a negative
// Dirent does not take a reference on its parent, has no Inode, and no children.
@@ -1153,19 +1153,19 @@ func (d *Dirent) destroy() {
if c.(*Dirent).IsNegative() {
// The parent holds both weak and strong refs in the case of
// negative dirents.
- c.DecRef()
+ c.DecRef(ctx)
}
// Drop the reference we just acquired in WeakRef.Get.
- c.DecRef()
+ c.DecRef(ctx)
}
- w.Drop()
+ w.Drop(ctx)
}
d.children = nil
allDirents.remove(d)
// Drop our reference to the Inode.
- d.Inode.DecRef()
+ d.Inode.DecRef(ctx)
// Allow the Dirent to be GC'ed after this point, since the Inode may still
// be referenced after the Dirent is destroyed (for instance by filesystem
@@ -1175,7 +1175,7 @@ func (d *Dirent) destroy() {
// Drop the reference we have on our parent if we took one. renameMu doesn't need to be
// held because d can't be reparented without any references to it left.
if d.parent != nil {
- d.parent.DecRef()
+ d.parent.DecRef(ctx)
}
}
@@ -1201,14 +1201,14 @@ func (d *Dirent) TryIncRef() bool {
// DecRef decreases the Dirent's refcount and drops its reference on its mount.
//
// DecRef implements RefCounter.DecRef with destructor d.destroy.
-func (d *Dirent) DecRef() {
+func (d *Dirent) DecRef(ctx context.Context) {
if d.Inode != nil {
// Keep mount around, since DecRef may destroy d.Inode.
msrc := d.Inode.MountSource
- d.DecRefWithDestructor(d.destroy)
+ d.DecRefWithDestructor(ctx, d.destroy)
msrc.DecDirentRefs()
} else {
- d.DecRefWithDestructor(d.destroy)
+ d.DecRefWithDestructor(ctx, d.destroy)
}
}
@@ -1359,7 +1359,7 @@ func (d *Dirent) MayDelete(ctx context.Context, root *Dirent, name string) error
if err != nil {
return err
}
- defer victim.DecRef()
+ defer victim.DecRef(ctx)
return d.mayDelete(ctx, victim)
}
@@ -1411,7 +1411,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
if err != nil {
return err
}
- defer renamed.DecRef()
+ defer renamed.DecRef(ctx)
// Check that the renamed dirent is deletable.
if err := oldParent.mayDelete(ctx, renamed); err != nil {
@@ -1453,13 +1453,13 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
// Check that we can delete replaced.
if err := newParent.mayDelete(ctx, replaced); err != nil {
- replaced.DecRef()
+ replaced.DecRef(ctx)
return err
}
// Target should not be an ancestor of source.
if oldParent.descendantOf(replaced) {
- replaced.DecRef()
+ replaced.DecRef(ctx)
// Note that Linux returns EINVAL if the source is an
// ancestor of target, but ENOTEMPTY if the target is
@@ -1470,7 +1470,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
// Check that replaced is not a mount point.
if replaced.isMountPointLocked() {
- replaced.DecRef()
+ replaced.DecRef(ctx)
return syscall.EBUSY
}
@@ -1478,11 +1478,11 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
oldIsDir := IsDir(renamed.Inode.StableAttr)
newIsDir := IsDir(replaced.Inode.StableAttr)
if !newIsDir && oldIsDir {
- replaced.DecRef()
+ replaced.DecRef(ctx)
return syscall.ENOTDIR
}
if !oldIsDir && newIsDir {
- replaced.DecRef()
+ replaced.DecRef(ctx)
return syscall.EISDIR
}
@@ -1493,13 +1493,13 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
// open across renames is currently broken for multiple
// reasons, so we flush all references on the replaced node and
// its children.
- replaced.Inode.Watches.Unpin(replaced)
+ replaced.Inode.Watches.Unpin(ctx, replaced)
replaced.mu.Lock()
- replaced.flush()
+ replaced.flush(ctx)
replaced.mu.Unlock()
// Done with replaced.
- replaced.DecRef()
+ replaced.DecRef(ctx)
}
if err := renamed.Inode.Rename(ctx, oldParent, renamed, newParent, newName, replaced != nil); err != nil {
@@ -1513,14 +1513,14 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
// can't destroy oldParent (and try to retake its lock) because
// Rename's caller must be holding a reference.
newParent.IncRef()
- oldParent.DecRef()
+ oldParent.DecRef(ctx)
}
if w, ok := newParent.children[newName]; ok {
- w.Drop()
+ w.Drop(ctx)
delete(newParent.children, newName)
}
if w, ok := oldParent.children[oldName]; ok {
- w.Drop()
+ w.Drop(ctx)
delete(oldParent.children, oldName)
}
@@ -1551,7 +1551,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
// Same as replaced.flush above.
renamed.mu.Lock()
- renamed.flush()
+ renamed.flush(ctx)
renamed.mu.Unlock()
return nil
diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go
index 33de32c69..7d9dd717e 100644
--- a/pkg/sentry/fs/dirent_cache.go
+++ b/pkg/sentry/fs/dirent_cache.go
@@ -17,6 +17,7 @@ package fs
import (
"fmt"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sync"
)
@@ -101,7 +102,7 @@ func (c *DirentCache) remove(d *Dirent) {
panic(fmt.Sprintf("trying to remove %v, which is not in the dirent cache", d))
}
c.list.Remove(d)
- d.DecRef()
+ d.DecRef(context.Background())
c.currentSize--
if c.limit != nil {
c.limit.dec()
diff --git a/pkg/sentry/fs/dirent_refs_test.go b/pkg/sentry/fs/dirent_refs_test.go
index 98d69c6f2..176b894ba 100644
--- a/pkg/sentry/fs/dirent_refs_test.go
+++ b/pkg/sentry/fs/dirent_refs_test.go
@@ -51,7 +51,7 @@ func TestWalkPositive(t *testing.T) {
t.Fatalf("child name = %q has a ref count of %d, want %d", d.name, got, 1)
}
- d.DecRef()
+ d.DecRef(ctx)
if got := root.ReadRefs(); got != 1 {
t.Fatalf("root has a ref count of %d, want %d", got, 1)
@@ -61,7 +61,7 @@ func TestWalkPositive(t *testing.T) {
t.Fatalf("child name = %q has a ref count of %d, want %d", d.name, got, 0)
}
- root.flush()
+ root.flush(ctx)
if got := len(root.children); got != 0 {
t.Fatalf("root has %d children, want %d", got, 0)
@@ -114,7 +114,7 @@ func TestWalkNegative(t *testing.T) {
t.Fatalf("child has a ref count of %d, want %d", got, 2)
}
- child.DecRef()
+ child.DecRef(ctx)
if got := child.(*Dirent).ReadRefs(); got != 1 {
t.Fatalf("child has a ref count of %d, want %d", got, 1)
@@ -124,7 +124,7 @@ func TestWalkNegative(t *testing.T) {
t.Fatalf("root has %d children, want %d", got, 1)
}
- root.DecRef()
+ root.DecRef(ctx)
if got := root.ReadRefs(); got != 0 {
t.Fatalf("root has a ref count of %d, want %d", got, 0)
@@ -351,9 +351,9 @@ func TestRemoveExtraRefs(t *testing.T) {
t.Fatalf("dirent has a ref count of %d, want %d", got, 1)
}
- d.DecRef()
+ d.DecRef(ctx)
- test.root.flush()
+ test.root.flush(ctx)
if got := len(test.root.children); got != 0 {
t.Errorf("root has %d children, want %d", got, 0)
@@ -403,8 +403,8 @@ func TestRenameExtraRefs(t *testing.T) {
t.Fatalf("Rename got error %v, want nil", err)
}
- oldParent.flush()
- newParent.flush()
+ oldParent.flush(ctx)
+ newParent.flush(ctx)
// Expect to have only active references.
if got := renamed.ReadRefs(); got != 1 {
diff --git a/pkg/sentry/fs/dirent_state.go b/pkg/sentry/fs/dirent_state.go
index f623d6c0e..67a35f0b2 100644
--- a/pkg/sentry/fs/dirent_state.go
+++ b/pkg/sentry/fs/dirent_state.go
@@ -18,6 +18,7 @@ import (
"fmt"
"sync/atomic"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/refs"
)
@@ -48,7 +49,7 @@ func (d *Dirent) saveChildren() map[string]*Dirent {
for name, w := range d.children {
if rc := w.Get(); rc != nil {
// Drop the reference count obtained in w.Get().
- rc.DecRef()
+ rc.DecRef(context.Background())
cd := rc.(*Dirent)
if cd.IsNegative() {
diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go
index 9fce177ad..b99199798 100644
--- a/pkg/sentry/fs/fdpipe/pipe.go
+++ b/pkg/sentry/fs/fdpipe/pipe.go
@@ -115,7 +115,7 @@ func (p *pipeOperations) Readiness(mask waiter.EventMask) (eventMask waiter.Even
}
// Release implements fs.FileOperations.Release.
-func (p *pipeOperations) Release() {
+func (p *pipeOperations) Release(context.Context) {
fdnotifier.RemoveFD(int32(p.file.FD()))
p.file.Close()
p.file = nil
diff --git a/pkg/sentry/fs/fdpipe/pipe_opener_test.go b/pkg/sentry/fs/fdpipe/pipe_opener_test.go
index e556da48a..b9cec4b13 100644
--- a/pkg/sentry/fs/fdpipe/pipe_opener_test.go
+++ b/pkg/sentry/fs/fdpipe/pipe_opener_test.go
@@ -182,7 +182,7 @@ func TestTryOpen(t *testing.T) {
// Clean up the state of the pipe, and remove the fd from the
// fdnotifier. Sadly this is needed to maintain the correctness
// of other tests because the fdnotifier is global.
- pipeOps.Release()
+ pipeOps.Release(ctx)
}
continue
}
@@ -191,7 +191,7 @@ func TestTryOpen(t *testing.T) {
}
if pipeOps != nil {
// Same as above.
- pipeOps.Release()
+ pipeOps.Release(ctx)
}
}
}
@@ -279,7 +279,7 @@ func TestPipeOpenUnblocksEventually(t *testing.T) {
pipeOps, err := Open(ctx, opener, flags)
if pipeOps != nil {
// Same as TestTryOpen.
- pipeOps.Release()
+ pipeOps.Release(ctx)
}
// Check that the partner opened the file successfully.
@@ -325,7 +325,7 @@ func TestCopiedReadAheadBuffer(t *testing.T) {
ctx := contexttest.Context(t)
pipeOps, err := pipeOpenState.TryOpen(ctx, opener, fs.FileFlags{Read: true})
if pipeOps != nil {
- pipeOps.Release()
+ pipeOps.Release(ctx)
t.Fatalf("open(%s, %o) got file, want nil", name, syscall.O_RDONLY)
}
if err != syserror.ErrWouldBlock {
@@ -351,7 +351,7 @@ func TestCopiedReadAheadBuffer(t *testing.T) {
if pipeOps == nil {
t.Fatalf("open(%s, %o) got nil file, want not nil", name, syscall.O_RDONLY)
}
- defer pipeOps.Release()
+ defer pipeOps.Release(ctx)
if err != nil {
t.Fatalf("open(%s, %o) got error %v, want nil", name, syscall.O_RDONLY, err)
@@ -471,14 +471,14 @@ func TestPipeHangup(t *testing.T) {
f := <-fdchan
if f < 0 {
t.Errorf("%s: partner routine got fd %d, want > 0", test.desc, f)
- pipeOps.Release()
+ pipeOps.Release(ctx)
continue
}
if test.hangupSelf {
// Hangup self and assert that our partner got the expected hangup
// error.
- pipeOps.Release()
+ pipeOps.Release(ctx)
if test.flags.Read {
// Partner is writer.
@@ -490,7 +490,7 @@ func TestPipeHangup(t *testing.T) {
} else {
// Hangup our partner and expect us to get the hangup error.
syscall.Close(f)
- defer pipeOps.Release()
+ defer pipeOps.Release(ctx)
if test.flags.Read {
assertReaderHungup(t, test.desc, pipeOps.(*pipeOperations).file)
diff --git a/pkg/sentry/fs/fdpipe/pipe_test.go b/pkg/sentry/fs/fdpipe/pipe_test.go
index a0082ecca..1c9e82562 100644
--- a/pkg/sentry/fs/fdpipe/pipe_test.go
+++ b/pkg/sentry/fs/fdpipe/pipe_test.go
@@ -98,10 +98,11 @@ func TestNewPipe(t *testing.T) {
}
f := fd.New(gfd)
- p, err := newPipeOperations(contexttest.Context(t), nil, test.flags, f, test.readAheadBuffer)
+ ctx := contexttest.Context(t)
+ p, err := newPipeOperations(ctx, nil, test.flags, f, test.readAheadBuffer)
if p != nil {
// This is necessary to remove the fd from the global fd notifier.
- defer p.Release()
+ defer p.Release(ctx)
} else {
// If there is no p to DecRef on, because newPipeOperations failed, then the
// file still needs to be closed.
@@ -153,13 +154,14 @@ func TestPipeDestruction(t *testing.T) {
syscall.Close(fds[1])
// Test the read end, but it doesn't really matter which.
- p, err := newPipeOperations(contexttest.Context(t), nil, fs.FileFlags{Read: true}, f, nil)
+ ctx := contexttest.Context(t)
+ p, err := newPipeOperations(ctx, nil, fs.FileFlags{Read: true}, f, nil)
if err != nil {
f.Close()
t.Fatalf("newPipeOperations got error %v, want nil", err)
}
// Drop our only reference, which should trigger the destructor.
- p.Release()
+ p.Release(ctx)
if fdnotifier.HasFD(int32(fds[0])) {
t.Fatalf("after DecRef fdnotifier has fd %d, want no longer registered", fds[0])
@@ -282,7 +284,7 @@ func TestPipeRequest(t *testing.T) {
if err != nil {
t.Fatalf("%s: newPipeOperations got error %v, want nil", test.desc, err)
}
- defer p.Release()
+ defer p.Release(ctx)
inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Pipe})
file := fs.NewFile(ctx, fs.NewDirent(ctx, inode, "pipe"), fs.FileFlags{Read: true}, p)
@@ -334,7 +336,7 @@ func TestPipeReadAheadBuffer(t *testing.T) {
rfile.Close()
t.Fatalf("newPipeOperations got error %v, want nil", err)
}
- defer p.Release()
+ defer p.Release(ctx)
inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{
Type: fs.Pipe,
@@ -380,7 +382,7 @@ func TestPipeReadsAccumulate(t *testing.T) {
}
// Don't forget to remove the fd from the fd notifier. Otherwise other tests will
// likely be borked, because it's global :(
- defer p.Release()
+ defer p.Release(ctx)
inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{
Type: fs.Pipe,
@@ -448,7 +450,7 @@ func TestPipeWritesAccumulate(t *testing.T) {
}
// Don't forget to remove the fd from the fd notifier. Otherwise other tests
// will likely be borked, because it's global :(
- defer p.Release()
+ defer p.Release(ctx)
inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{
Type: fs.Pipe,
diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go
index 846252c89..72ea70fcf 100644
--- a/pkg/sentry/fs/file.go
+++ b/pkg/sentry/fs/file.go
@@ -142,17 +142,17 @@ func NewFile(ctx context.Context, dirent *Dirent, flags FileFlags, fops FileOper
}
// DecRef destroys the File when it is no longer referenced.
-func (f *File) DecRef() {
- f.DecRefWithDestructor(func() {
+func (f *File) DecRef(ctx context.Context) {
+ f.DecRefWithDestructor(ctx, func(context.Context) {
// Drop BSD style locks.
lockRng := lock.LockRange{Start: 0, End: lock.LockEOF}
- f.Dirent.Inode.LockCtx.BSD.UnlockRegion(lock.UniqueID(f.UniqueID), lockRng)
+ f.Dirent.Inode.LockCtx.BSD.UnlockRegion(f, lockRng)
// Release resources held by the FileOperations.
- f.FileOperations.Release()
+ f.FileOperations.Release(ctx)
// Release a reference on the Dirent.
- f.Dirent.DecRef()
+ f.Dirent.DecRef(ctx)
// Only unregister if we are currently registered. There is nothing
// to register if f.async is nil (this happens when async mode is
@@ -310,7 +310,6 @@ func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error
if !f.mu.Lock(ctx) {
return 0, syserror.ErrInterrupted
}
-
unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append)
// Handle append mode.
if f.Flags().Append {
@@ -355,7 +354,6 @@ func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64
// offset."
unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append)
defer unlockAppendMu()
-
if f.Flags().Append {
if err := f.offsetForAppend(ctx, &offset); err != nil {
return 0, err
@@ -374,9 +372,10 @@ func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64
return f.FileOperations.Write(ctx, f, src, offset)
}
-// offsetForAppend sets the given offset to the end of the file.
+// offsetForAppend atomically sets the given offset to the end of the file.
//
-// Precondition: the file.Dirent.Inode.appendMu mutex should be held for writing.
+// Precondition: the file.Dirent.Inode.appendMu mutex should be held for
+// writing.
func (f *File) offsetForAppend(ctx context.Context, offset *int64) error {
uattr, err := f.Dirent.Inode.UnstableAttr(ctx)
if err != nil {
@@ -386,7 +385,7 @@ func (f *File) offsetForAppend(ctx context.Context, offset *int64) error {
}
// Update the offset.
- *offset = uattr.Size
+ atomic.StoreInt64(offset, uattr.Size)
return nil
}
@@ -461,7 +460,7 @@ func (f *File) UnstableAttr(ctx context.Context) (UnstableAttr, error) {
func (f *File) MappedName(ctx context.Context) string {
root := RootFromContext(ctx)
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(ctx)
}
name, _ := f.Dirent.FullName(root)
return name
diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go
index beba0f771..305c0f840 100644
--- a/pkg/sentry/fs/file_operations.go
+++ b/pkg/sentry/fs/file_operations.go
@@ -67,7 +67,7 @@ type SpliceOpts struct {
// - File.Flags(): This value may change during the operation.
type FileOperations interface {
// Release releases resources held by FileOperations.
- Release()
+ Release(ctx context.Context)
// Waitable defines how this File can be waited on for read and
// write readiness.
@@ -160,6 +160,7 @@ type FileOperations interface {
// refer.
//
// Preconditions: The AddressSpace (if any) that io refers to is activated.
+ // Must only be called from a task goroutine.
Ioctl(ctx context.Context, file *File, io usermem.IO, args arch.SyscallArguments) (uintptr, error)
}
diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go
index dcc1df38f..9dc58d5ff 100644
--- a/pkg/sentry/fs/file_overlay.go
+++ b/pkg/sentry/fs/file_overlay.go
@@ -54,7 +54,7 @@ func overlayFile(ctx context.Context, inode *Inode, flags FileFlags) (*File, err
// Drop the extra reference on the Dirent. Now there's only one reference
// on the dirent, either owned by f (if non-nil), or the Dirent is about
// to be destroyed (if GetFile failed).
- dirent.DecRef()
+ dirent.DecRef(ctx)
return f, err
}
@@ -89,12 +89,12 @@ type overlayFileOperations struct {
}
// Release implements FileOperations.Release.
-func (f *overlayFileOperations) Release() {
+func (f *overlayFileOperations) Release(ctx context.Context) {
if f.upper != nil {
- f.upper.DecRef()
+ f.upper.DecRef(ctx)
}
if f.lower != nil {
- f.lower.DecRef()
+ f.lower.DecRef(ctx)
}
}
@@ -164,7 +164,7 @@ func (f *overlayFileOperations) Seek(ctx context.Context, file *File, whence See
func (f *overlayFileOperations) Readdir(ctx context.Context, file *File, serializer DentrySerializer) (int64, error) {
root := RootFromContext(ctx)
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(ctx)
}
dirCtx := &DirCtx{
@@ -497,7 +497,7 @@ func readdirOne(ctx context.Context, d *Dirent) (map[string]DentAttr, error) {
if err != nil {
return nil, err
}
- defer dir.DecRef()
+ defer dir.DecRef(ctx)
// Use a stub serializer to read the entries into memory.
stubSerializer := &CollectEntriesSerializer{}
@@ -521,10 +521,10 @@ type overlayMappingIdentity struct {
}
// DecRef implements AtomicRefCount.DecRef.
-func (omi *overlayMappingIdentity) DecRef() {
- omi.AtomicRefCount.DecRefWithDestructor(func() {
- omi.overlayFile.DecRef()
- omi.id.DecRef()
+func (omi *overlayMappingIdentity) DecRef(ctx context.Context) {
+ omi.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) {
+ omi.overlayFile.DecRef(ctx)
+ omi.id.DecRef(ctx)
})
}
@@ -544,7 +544,7 @@ func (omi *overlayMappingIdentity) InodeID() uint64 {
func (omi *overlayMappingIdentity) MappedName(ctx context.Context) string {
root := RootFromContext(ctx)
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(ctx)
}
name, _ := omi.overlayFile.Dirent.FullName(root)
return name
diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go
index 084da2a8d..d41f30bbb 100644
--- a/pkg/sentry/fs/filesystems.go
+++ b/pkg/sentry/fs/filesystems.go
@@ -87,20 +87,6 @@ func RegisterFilesystem(f Filesystem) {
filesystems.registered[f.Name()] = f
}
-// UnregisterFilesystem removes a file system from the global set. To keep the
-// file system set compatible with save/restore, UnregisterFilesystem must be
-// called before save/restore methods.
-//
-// For instance, packages may unregister their file system after it is mounted.
-// This makes sense for pseudo file systems that should not be visible or
-// mountable. See whitelistfs in fs/host/fs.go for one example.
-func UnregisterFilesystem(name string) {
- filesystems.mu.Lock()
- defer filesystems.mu.Unlock()
-
- delete(filesystems.registered, name)
-}
-
// FindFilesystem returns a Filesystem registered at name or (nil, false) if name
// is not a file system type that can be found in /proc/filesystems.
func FindFilesystem(name string) (Filesystem, bool) {
diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go
index bdba6efe5..d2dbff268 100644
--- a/pkg/sentry/fs/fs.go
+++ b/pkg/sentry/fs/fs.go
@@ -42,9 +42,10 @@
// Dirent.dirMu
// Dirent.mu
// DirentCache.mu
-// Locks in InodeOperations implementations or overlayEntry
// Inode.Watches.mu (see `Inotify` for other lock ordering)
// MountSource.mu
+// Inode.appendMu
+// Locks in InodeOperations implementations or overlayEntry
//
// If multiple Dirent or MountSource locks must be taken, locks in the parent must be
// taken before locks in their children.
diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD
index 789369220..5fb419bcd 100644
--- a/pkg/sentry/fs/fsutil/BUILD
+++ b/pkg/sentry/fs/fsutil/BUILD
@@ -8,7 +8,6 @@ go_template_instance(
out = "dirty_set_impl.go",
imports = {
"memmap": "gvisor.dev/gvisor/pkg/sentry/memmap",
- "platform": "gvisor.dev/gvisor/pkg/sentry/platform",
},
package = "fsutil",
prefix = "Dirty",
@@ -25,14 +24,14 @@ go_template_instance(
name = "frame_ref_set_impl",
out = "frame_ref_set_impl.go",
imports = {
- "platform": "gvisor.dev/gvisor/pkg/sentry/platform",
+ "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap",
},
package = "fsutil",
prefix = "FrameRef",
template = "//pkg/segment:generic_set",
types = {
"Key": "uint64",
- "Range": "platform.FileRange",
+ "Range": "memmap.FileRange",
"Value": "uint64",
"Functions": "FrameRefSetFunctions",
},
@@ -43,7 +42,6 @@ go_template_instance(
out = "file_range_set_impl.go",
imports = {
"memmap": "gvisor.dev/gvisor/pkg/sentry/memmap",
- "platform": "gvisor.dev/gvisor/pkg/sentry/platform",
},
package = "fsutil",
prefix = "FileRange",
@@ -86,7 +84,6 @@ go_library(
"//pkg/sentry/kernel/time",
"//pkg/sentry/memmap",
"//pkg/sentry/pgalloc",
- "//pkg/sentry/platform",
"//pkg/sentry/socket/unix/transport",
"//pkg/sentry/usage",
"//pkg/state",
diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go
index c6cd45087..2c9446c1d 100644
--- a/pkg/sentry/fs/fsutil/dirty_set.go
+++ b/pkg/sentry/fs/fsutil/dirty_set.go
@@ -20,7 +20,6 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -159,7 +158,7 @@ func (ds *DirtySet) AllowClean(mr memmap.MappableRange) {
// repeatedly until all bytes have been written. max is the true size of the
// cached object; offsets beyond max will not be passed to writeAt, even if
// they are marked dirty.
-func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, dirty *DirtySet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
+func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, dirty *DirtySet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
var changedDirty bool
defer func() {
if changedDirty {
@@ -194,7 +193,7 @@ func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet
// successful partial write, SyncDirtyAll will call it repeatedly until all
// bytes have been written. max is the true size of the cached object; offsets
// beyond max will not be passed to writeAt, even if they are marked dirty.
-func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
+func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
dseg := dirty.FirstSegment()
for dseg.Ok() {
if err := syncDirtyRange(ctx, dseg.Range(), cache, max, mem, writeAt); err != nil {
@@ -210,7 +209,7 @@ func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max
}
// Preconditions: mr must be page-aligned.
-func syncDirtyRange(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
+func syncDirtyRange(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
for cseg := cache.LowerBoundSegment(mr.Start); cseg.Ok() && cseg.Start() < mr.End; cseg = cseg.NextSegment() {
wbr := cseg.Range().Intersect(mr)
if max < wbr.Start {
diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go
index 08695391c..dc9efa5df 100644
--- a/pkg/sentry/fs/fsutil/file.go
+++ b/pkg/sentry/fs/fsutil/file.go
@@ -31,7 +31,7 @@ import (
type FileNoopRelease struct{}
// Release is a no-op.
-func (FileNoopRelease) Release() {}
+func (FileNoopRelease) Release(context.Context) {}
// SeekWithDirCursor is used to implement fs.FileOperations.Seek. If dirCursor
// is not nil and the seek was on a directory, the cursor will be updated.
@@ -296,7 +296,7 @@ func (sdfo *StaticDirFileOperations) IterateDir(ctx context.Context, d *fs.Diren
func (sdfo *StaticDirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) {
root := fs.RootFromContext(ctx)
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(ctx)
}
dirCtx := &fs.DirCtx{
Serializer: serializer,
diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go
index 5643cdac9..bbafebf03 100644
--- a/pkg/sentry/fs/fsutil/file_range_set.go
+++ b/pkg/sentry/fs/fsutil/file_range_set.go
@@ -23,13 +23,12 @@ import (
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/usermem"
)
// FileRangeSet maps offsets into a memmap.Mappable to offsets into a
-// platform.File. It is used to implement Mappables that store data in
+// memmap.File. It is used to implement Mappables that store data in
// sparsely-allocated memory.
//
// type FileRangeSet <generated by go_generics>
@@ -65,20 +64,20 @@ func (FileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, spli
}
// FileRange returns the FileRange mapped by seg.
-func (seg FileRangeIterator) FileRange() platform.FileRange {
+func (seg FileRangeIterator) FileRange() memmap.FileRange {
return seg.FileRangeOf(seg.Range())
}
// FileRangeOf returns the FileRange mapped by mr.
//
// Preconditions: seg.Range().IsSupersetOf(mr). mr.Length() != 0.
-func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) platform.FileRange {
+func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) memmap.FileRange {
frstart := seg.Value() + (mr.Start - seg.Start())
- return platform.FileRange{frstart, frstart + mr.Length()}
+ return memmap.FileRange{frstart, frstart + mr.Length()}
}
// Fill attempts to ensure that all memmap.Mappable offsets in required are
-// mapped to a platform.File offset, by allocating from mf with the given
+// mapped to a memmap.File offset, by allocating from mf with the given
// memory usage kind and invoking readAt to store data into memory. (If readAt
// returns a successful partial read, Fill will call it repeatedly until all
// bytes have been read.) EOF is handled consistently with the requirements of
@@ -141,7 +140,7 @@ func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.Map
}
// Drop removes segments for memmap.Mappable offsets in mr, freeing the
-// corresponding platform.FileRanges.
+// corresponding memmap.FileRanges.
//
// Preconditions: mr must be page-aligned.
func (frs *FileRangeSet) Drop(mr memmap.MappableRange, mf *pgalloc.MemoryFile) {
@@ -154,7 +153,7 @@ func (frs *FileRangeSet) Drop(mr memmap.MappableRange, mf *pgalloc.MemoryFile) {
}
// DropAll removes all segments in mr, freeing the corresponding
-// platform.FileRanges.
+// memmap.FileRanges.
func (frs *FileRangeSet) DropAll(mf *pgalloc.MemoryFile) {
for seg := frs.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
mf.DecRef(seg.FileRange())
diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go
index dd6f5aba6..a808894df 100644
--- a/pkg/sentry/fs/fsutil/frame_ref_set.go
+++ b/pkg/sentry/fs/fsutil/frame_ref_set.go
@@ -17,7 +17,7 @@ package fsutil
import (
"math"
- "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/usage"
)
@@ -39,7 +39,7 @@ func (FrameRefSetFunctions) ClearValue(val *uint64) {
}
// Merge implements segment.Functions.Merge.
-func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.FileRange, val2 uint64) (uint64, bool) {
+func (FrameRefSetFunctions) Merge(_ memmap.FileRange, val1 uint64, _ memmap.FileRange, val2 uint64) (uint64, bool) {
if val1 != val2 {
return 0, false
}
@@ -47,13 +47,13 @@ func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.
}
// Split implements segment.Functions.Split.
-func (FrameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) {
+func (FrameRefSetFunctions) Split(_ memmap.FileRange, val uint64, _ uint64) (uint64, uint64) {
return val, val
}
// IncRefAndAccount adds a reference on the range fr. All newly inserted segments
// are accounted as host page cache memory mappings.
-func (refs *FrameRefSet) IncRefAndAccount(fr platform.FileRange) {
+func (refs *FrameRefSet) IncRefAndAccount(fr memmap.FileRange) {
seg, gap := refs.Find(fr.Start)
for {
switch {
@@ -74,7 +74,7 @@ func (refs *FrameRefSet) IncRefAndAccount(fr platform.FileRange) {
// DecRefAndAccount removes a reference on the range fr and untracks segments
// that are removed from memory accounting.
-func (refs *FrameRefSet) DecRefAndAccount(fr platform.FileRange) {
+func (refs *FrameRefSet) DecRefAndAccount(fr memmap.FileRange) {
seg := refs.FindSegment(fr.Start)
for seg.Ok() && seg.Start() < fr.End {
diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go
index e82afd112..ef0113b52 100644
--- a/pkg/sentry/fs/fsutil/host_file_mapper.go
+++ b/pkg/sentry/fs/fsutil/host_file_mapper.go
@@ -21,7 +21,6 @@ import (
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -126,7 +125,7 @@ func (f *HostFileMapper) DecRefOn(mr memmap.MappableRange) {
// offsets in fr or until the next call to UnmapAll.
//
// Preconditions: The caller must hold a reference on all offsets in fr.
-func (f *HostFileMapper) MapInternal(fr platform.FileRange, fd int, write bool) (safemem.BlockSeq, error) {
+func (f *HostFileMapper) MapInternal(fr memmap.FileRange, fd int, write bool) (safemem.BlockSeq, error) {
chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift)
f.mapsMu.Lock()
defer f.mapsMu.Unlock()
@@ -146,7 +145,7 @@ func (f *HostFileMapper) MapInternal(fr platform.FileRange, fd int, write bool)
}
// Preconditions: f.mapsMu must be locked.
-func (f *HostFileMapper) forEachMappingBlockLocked(fr platform.FileRange, fd int, write bool, fn func(safemem.Block)) error {
+func (f *HostFileMapper) forEachMappingBlockLocked(fr memmap.FileRange, fd int, write bool, fn func(safemem.Block)) error {
prot := syscall.PROT_READ
if write {
prot |= syscall.PROT_WRITE
diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go
index 78fec553e..c15d8a946 100644
--- a/pkg/sentry/fs/fsutil/host_mappable.go
+++ b/pkg/sentry/fs/fsutil/host_mappable.go
@@ -21,18 +21,17 @@ import (
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/usermem"
)
-// HostMappable implements memmap.Mappable and platform.File over a
+// HostMappable implements memmap.Mappable and memmap.File over a
// CachedFileObject.
//
// Lock order (compare the lock order model in mm/mm.go):
// truncateMu ("fs locks")
// mu ("memmap.Mappable locks not taken by Translate")
-// ("platform.File locks")
+// ("memmap.File locks")
// backingFile ("CachedFileObject locks")
//
// +stateify savable
@@ -124,24 +123,24 @@ func (h *HostMappable) NotifyChangeFD() error {
return nil
}
-// MapInternal implements platform.File.MapInternal.
-func (h *HostMappable) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+// MapInternal implements memmap.File.MapInternal.
+func (h *HostMappable) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
return h.hostFileMapper.MapInternal(fr, h.backingFile.FD(), at.Write)
}
-// FD implements platform.File.FD.
+// FD implements memmap.File.FD.
func (h *HostMappable) FD() int {
return h.backingFile.FD()
}
-// IncRef implements platform.File.IncRef.
-func (h *HostMappable) IncRef(fr platform.FileRange) {
+// IncRef implements memmap.File.IncRef.
+func (h *HostMappable) IncRef(fr memmap.FileRange) {
mr := memmap.MappableRange{Start: fr.Start, End: fr.End}
h.hostFileMapper.IncRefOn(mr)
}
-// DecRef implements platform.File.DecRef.
-func (h *HostMappable) DecRef(fr platform.FileRange) {
+// DecRef implements memmap.File.DecRef.
+func (h *HostMappable) DecRef(fr memmap.FileRange) {
mr := memmap.MappableRange{Start: fr.Start, End: fr.End}
h.hostFileMapper.DecRefOn(mr)
}
diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go
index 800c8b4e1..fe8b0b6ac 100644
--- a/pkg/sentry/fs/fsutil/inode_cached.go
+++ b/pkg/sentry/fs/fsutil/inode_cached.go
@@ -26,7 +26,6 @@ import (
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/usermem"
@@ -934,7 +933,7 @@ func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange
// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error {
- // Whether we have a host fd (and consequently what platform.File is
+ // Whether we have a host fd (and consequently what memmap.File is
// mapped) can change across save/restore, so invalidate all translations
// unconditionally.
c.mapsMu.Lock()
@@ -999,10 +998,10 @@ func (c *CachingInodeOperations) Evict(ctx context.Context, er pgalloc.Evictable
}
}
-// IncRef implements platform.File.IncRef. This is used when we directly map an
-// underlying host fd and CachingInodeOperations is used as the platform.File
+// IncRef implements memmap.File.IncRef. This is used when we directly map an
+// underlying host fd and CachingInodeOperations is used as the memmap.File
// during translation.
-func (c *CachingInodeOperations) IncRef(fr platform.FileRange) {
+func (c *CachingInodeOperations) IncRef(fr memmap.FileRange) {
// Hot path. Avoid defers.
c.dataMu.Lock()
seg, gap := c.refs.Find(fr.Start)
@@ -1024,10 +1023,10 @@ func (c *CachingInodeOperations) IncRef(fr platform.FileRange) {
}
}
-// DecRef implements platform.File.DecRef. This is used when we directly map an
-// underlying host fd and CachingInodeOperations is used as the platform.File
+// DecRef implements memmap.File.DecRef. This is used when we directly map an
+// underlying host fd and CachingInodeOperations is used as the memmap.File
// during translation.
-func (c *CachingInodeOperations) DecRef(fr platform.FileRange) {
+func (c *CachingInodeOperations) DecRef(fr memmap.FileRange) {
// Hot path. Avoid defers.
c.dataMu.Lock()
seg := c.refs.FindSegment(fr.Start)
@@ -1046,15 +1045,15 @@ func (c *CachingInodeOperations) DecRef(fr platform.FileRange) {
c.dataMu.Unlock()
}
-// MapInternal implements platform.File.MapInternal. This is used when we
+// MapInternal implements memmap.File.MapInternal. This is used when we
// directly map an underlying host fd and CachingInodeOperations is used as the
-// platform.File during translation.
-func (c *CachingInodeOperations) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+// memmap.File during translation.
+func (c *CachingInodeOperations) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
return c.hostFileMapper.MapInternal(fr, c.backingFile.FD(), at.Write)
}
-// FD implements platform.File.FD. This is used when we directly map an
-// underlying host fd and CachingInodeOperations is used as the platform.File
+// FD implements memmap.File.FD. This is used when we directly map an
+// underlying host fd and CachingInodeOperations is used as the memmap.File
// during translation.
func (c *CachingInodeOperations) FD() int {
return c.backingFile.FD()
diff --git a/pkg/sentry/fs/g3doc/.gitignore b/pkg/sentry/fs/g3doc/.gitignore
new file mode 100644
index 000000000..2d19fc766
--- /dev/null
+++ b/pkg/sentry/fs/g3doc/.gitignore
@@ -0,0 +1 @@
+*.html
diff --git a/pkg/sentry/fs/g3doc/fuse.md b/pkg/sentry/fs/g3doc/fuse.md
new file mode 100644
index 000000000..2ca84dd74
--- /dev/null
+++ b/pkg/sentry/fs/g3doc/fuse.md
@@ -0,0 +1,263 @@
+# Foreword
+
+This document describes an ongoing project to support FUSE filesystems within
+the sentry. This is intended to become the final documentation for this
+subsystem, and is therefore written in the past tense. However, FUSE support is
+currently incomplete, and the document will be updated as things progress.
+
+# FUSE: Filesystem in Userspace
+
+The sentry supports dispatching filesystem operations to a FUSE server, allowing
+FUSE filesystems to be used with a sandbox.
+
+## Overview
+
+FUSE has two main components:
+
+1. A client kernel driver (canonically `fuse.ko` in Linux), which forwards
+ filesystem operations (usually initiated by syscalls) to the server.
+
+2. A server, which is a userspace daemon that implements the actual filesystem.
+
+The sentry implements the client component, which allows a server daemon running
+within the sandbox to provide a filesystem for the rest of the sandbox.
+
+A FUSE filesystem is initialized with `mount(2)`, typically with the help of a
+utility like `fusermount(1)`. Various mount options exist for establishing
+ownership and access permissions on the filesystem, but the most important mount
+option is a file descriptor used to establish communication between the client
+and server.
+
+The FUSE device FD is obtained by opening `/dev/fuse`. During regular operation,
+the client and server use the FUSE protocol described in `fuse(4)` to service
+filesystem operations. See the "Protocol" section below for more information
+about this protocol. The core of the sentry support for FUSE is the client-side
+implementation of this protocol.
+
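+As background, here is a minimal (non-sentry) sketch of what a utility like
+`fusermount(1)` does with that file descriptor. The `fd=`, `rootmode=`,
+`user_id=`, and `group_id=` options are the standard Linux FUSE mount options;
+the source and target names are made up:
+
+```go
+package main
+
+import (
+	"fmt"
+	"os"
+	"syscall"
+)
+
+func main() {
+	// Open the FUSE device; this FD is the client/server channel.
+	dev, err := os.OpenFile("/dev/fuse", os.O_RDWR, 0)
+	if err != nil {
+		panic(err)
+	}
+	// Hand the FD to mount(2). rootmode=40000 marks the root as a directory.
+	opts := fmt.Sprintf("fd=%d,rootmode=40000,user_id=0,group_id=0", dev.Fd())
+	if err := syscall.Mount("myfs", "/mnt/myfs", "fuse", 0, opts); err != nil {
+		panic(err)
+	}
+	// A server daemon would now service requests by reading and writing dev.
+}
+```
+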
+## FUSE in the Sentry
+
+The sentry's FUSE client targets VFS2 and has the following components:
+
+- An implementation of `/dev/fuse`.
+
+- A VFS2 filesystem for mapping syscalls to FUSE ops. Since we're targeting
+ VFS2, one point of contention may be the lack of inodes in VFS2. We can
+ tentatively implement a kernfs-based filesystem to bridge the gap in APIs.
+ The kernfs base functionality can serve the role of the Linux inode cache,
+ and the filesystem can map VFS2 syscalls to kernfs inode operations; see
+ the `kernfs.Inode` interface.
+
+The FUSE protocol lends itself well to marshaling with `go_marshal`. The various
+request and response packets can be defined in the ABI package and converted to
+and from the wire format using `go_marshal`.
+
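+As a sketch of that approach, the request header below is declared the way a
+`go_marshal` type would be; the Go type and field names are illustrative, not
+the final ABI definitions:
+
+```go
+package linux
+
+// FUSEHeaderIn mirrors struct fuse_in_header (see the Appendix).
+//
+// +marshal
+type FUSEHeaderIn struct {
+	Len     uint32 // Length of the request, including this header.
+	Opcode  uint32 // Requested operation.
+	Unique  uint64 // A unique identifier for this request.
+	NodeID  uint64 // ID of the filesystem object being operated on.
+	UID     uint32
+	GID     uint32
+	PID     uint32
+	Padding uint32
+}
+```
+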
+### Design Goals
+
+- While filesystem performance is always important, the sentry's FUSE support
+ is primarily concerned with compatibility, with performance as a secondary
+ concern.
+
+- Avoid deadlocks caused by a hung server daemon.
+
+- Consider the potential for denial of service from a malicious server daemon.
+ The sentry already aims to protect itself from userspace, but this needs
+ additional consideration for FUSE. Normally, an operating system
+ doesn't rely on userspace to make progress with filesystem operations. Since
+ this changes with FUSE, it opens up the possibility of creating a chain of
+ dependencies controlled by userspace, which could affect an entire sandbox.
+ For example: a FUSE op can block a syscall, which could be holding a
+ subsystem lock, which can then block another task goroutine.
+
+### Milestones
+
+Below are some broad goals to aim for while implementing FUSE in the sentry.
+Many FUSE ops can be grouped into broad categories of functionality, and most
+ops can be implemented in parallel.
+
+#### Minimal client that can mount a trivial FUSE filesystem
+
+- Implement `/dev/fuse`, a character device used to establish an FD for
+ communication between the sentry and the server daemon.
+
+- Implement basic FUSE ops like `FUSE_INIT` and `FUSE_DESTROY`.
+
+#### Read-only mount with basic file operations
+
+- Implement the majority of file, directory, and file descriptor FUSE ops. For
+ this milestone, we can skip uncommon or complex operations like mmap, mknod,
+ file locking, poll, and extended attributes. We can stub these out along
+ with any ops that modify the filesystem. The exact list of required ops is
+ to be determined, but the goal is to mount a real filesystem as read-only,
+ and be able to read contents from the filesystem in the sentry.
+
+#### Full read-write support
+
+- Implement the remaining FUSE ops and decide if we can omit rarely used
+ operations like ioctl.
+
+# Appendix
+
+## FUSE Protocol
+
+The FUSE protocol is a request-response protocol. All requests are initiated by
+the client. The wire format for the protocol is raw C structs serialized to
+memory.
+
+All FUSE requests begin with the following request header:
+
+```c
+struct fuse_in_header {
+ uint32_t len; // Length of the request, including this header.
+ uint32_t opcode; // Requested operation.
+ uint64_t unique; // A unique identifier for this request.
+ uint64_t nodeid; // ID of the filesystem object being operated on.
+ uint32_t uid; // UID of the requesting process.
+ uint32_t gid; // GID of the requesting process.
+ uint32_t pid; // PID of the requesting process.
+ uint32_t padding;
+};
+```
+
+The request is then followed by a payload specific to the `opcode`.
+
+All responses begin with this response header:
+
+```c
+struct fuse_out_header {
+ uint32_t len; // Length of the response, including this header.
+ int32_t error; // Status of the request, 0 if success.
+ uint64_t unique; // The unique identifier from the corresponding request.
+};
+```
+
+The response payload also depends on the request `opcode`. If `error != 0`, the
+response payload must be empty.
+
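+For illustration, a standalone sketch of decoding a response header from raw
+bytes read off the device FD; `encoding/binary` stands in for `go_marshal`
+here, and the struct name is hypothetical:
+
+```go
+package main
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+)
+
+// fuseOutHeader mirrors struct fuse_out_header above.
+type fuseOutHeader struct {
+	Len    uint32
+	Error  int32
+	Unique uint64
+}
+
+func main() {
+	// A fabricated 16-byte response: len=16, error=0, unique=42.
+	raw := []byte{16, 0, 0, 0, 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0}
+	var hdr fuseOutHeader
+	if err := binary.Read(bytes.NewReader(raw), binary.LittleEndian, &hdr); err != nil {
+		panic(err)
+	}
+	fmt.Printf("len=%d error=%d unique=%d\n", hdr.Len, hdr.Error, hdr.Unique)
+}
+```
+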
+### Operations
+
+The following is a list of all FUSE operations used in `fuse_in_header.opcode`
+as of Linux v4.4, and a brief description of their purpose. These are defined in
+`uapi/linux/fuse.h`. Many of these have a corresponding request and response
+payload struct; `fuse(4)` has details for some of these. We also note how these
+operations map to the sentry virtual filesystem.
+
+#### FUSE meta-operations
+
+These operations are specific to FUSE and don't have a corresponding action in a
+generic filesystem.
+
+- `FUSE_INIT`: This operation initializes a new FUSE filesystem, and is the
+ first message sent by the client after mount. It is used for version and
+ feature negotiation, and is related to `mount(2)`.
+- `FUSE_DESTROY`: Tears down a FUSE filesystem; related to `umount(2)`.
+- `FUSE_INTERRUPT`: Interrupts an in-flight operation, specified by the
+ `fuse_in_header.unique` value provided in the corresponding request header.
+ The client can send at most one of these per request, and will enter an
+ uninterruptible wait for a reply. The server is expected to reply promptly.
+- `FUSE_FORGET`: A hint to the server that it should evict the indicated
+ node from any caches. This is wired up to `(struct
+ super_operations).evict_inode` in Linux, which is in turn hooked into the
+ inode cache shrinker, which is typically triggered by system memory pressure.
+- `FUSE_BATCH_FORGET`: Batch version of `FUSE_FORGET`.
+
+#### Filesystem Syscalls
+
+These FUSE ops map directly to an equivalent filesystem syscall, or family of
+syscalls. The relevant syscalls have a similar name to the operation, unless
+otherwise noted.
+
+Node creation:
+
+- `FUSE_MKNOD`
+- `FUSE_MKDIR`
+- `FUSE_CREATE`: This is equivalent to `open(2)` and `creat(2)`, which
+ atomically creates and opens a node.
+
+Node attributes and extended attributes:
+
+- `FUSE_GETATTR`
+- `FUSE_SETATTR`
+- `FUSE_SETXATTR`
+- `FUSE_GETXATTR`
+- `FUSE_LISTXATTR`
+- `FUSE_REMOVEXATTR`
+
+Node link manipulation:
+
+- `FUSE_READLINK`
+- `FUSE_LINK`
+- `FUSE_SYMLINK`
+- `FUSE_UNLINK`
+
+Directory operations:
+
+- `FUSE_RMDIR`
+- `FUSE_RENAME`
+- `FUSE_RENAME2`
+- `FUSE_OPENDIR`: `open(2)` for directories.
+- `FUSE_RELEASEDIR`: `close(2)` for directories.
+- `FUSE_READDIR`
+- `FUSE_READDIRPLUS`
+- `FUSE_FSYNCDIR`: `fsync(2)` for directories.
+- `FUSE_LOOKUP`: Establishes a unique identifier for a FS node. This is
+ reminiscent of `VirtualFilesystem.GetDentryAt` in that it resolves a path
+ component to a node. However, the returned identifier is opaque to the
+ client. The server must remember this mapping, as this is how the client
+ will reference the node in the future; see the sketch after this list.
+
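+The sketch below illustrates the node ID bookkeeping that `FUSE_LOOKUP`
+implies for a server; all names are hypothetical, and a real server would also
+track per-node reference counts so `FUSE_FORGET` could drop entries:
+
+```go
+package main
+
+import "fmt"
+
+// nodeTable hands out the opaque node IDs returned in FUSE_LOOKUP replies and
+// remembers which filesystem object each one names.
+type nodeTable struct {
+	next  uint64
+	nodes map[uint64]string // node ID -> path, for illustration only
+}
+
+func (t *nodeTable) lookup(path string) uint64 {
+	t.next++ // Start above 1; node ID 1 conventionally names the root.
+	t.nodes[t.next] = path
+	return t.next
+}
+
+func main() {
+	t := &nodeTable{next: 1, nodes: map[uint64]string{1: "/"}}
+	id := t.lookup("/etc/hosts")
+	// All subsequent requests identify the node by this opaque ID.
+	fmt.Println(id, t.nodes[id])
+}
+```
+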
+File operations:
+
+- `FUSE_OPEN`: `open(2)` for files.
+- `FUSE_RELEASE`: `close(2)` for files.
+- `FUSE_FSYNC`
+- `FUSE_FALLOCATE`
+- `FUSE_COPY_FILE_RANGE`
+- `FUSE_SETUPMAPPING`: Creates a memory map on a file for `mmap(2)`.
+- `FUSE_REMOVEMAPPING`: Removes a memory map for `munmap(2)`.
+
+File locking:
+
+- `FUSE_GETLK`
+- `FUSE_SETLK`
+- `FUSE_SETLKW`
+
+File descriptor operations:
+
+- `FUSE_IOCTL`
+- `FUSE_POLL`
+- `FUSE_LSEEK`
+
+Filesystem operations:
+
+- `FUSE_STATFS`
+
+#### Permissions
+
+- `FUSE_ACCESS` is used to check if a node is accessible, as part of many
+ syscall implementations. Maps to `vfs.FilesystemImpl.AccessAt` in the
+ sentry.
+
+#### I/O Operations
+
+These ops are used to read and write file pages. They're used to implement
+I/O syscalls like `read(2)` and `write(2)`, as well as memory-mapped I/O via
+`mmap(2)`; see the sketch after this list.
+
+- `FUSE_READ`
+- `FUSE_WRITE`
+
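+For example, a `FUSE_READ` request header is followed by this payload
+(transcribed from `uapi/linux/fuse.h`; the Go naming is illustrative):
+
+```go
+package linux
+
+// FUSEReadIn mirrors struct fuse_read_in, the FUSE_READ request payload.
+//
+// +marshal
+type FUSEReadIn struct {
+	Fh        uint64 // File handle returned by FUSE_OPEN.
+	Offset    uint64 // Byte offset to start reading from.
+	Size      uint32 // Number of bytes requested.
+	ReadFlags uint32
+	LockOwner uint64
+	Flags     uint32 // Open flags of the file.
+	Padding   uint32
+}
+```
+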
+#### Miscellaneous
+
+- `FUSE_FLUSH`: Used by the client to indicate when a file descriptor is
+ closed. Distinct from `FUSE_FSYNC`, which corresponds to an `fsync(2)`
+ syscall from the user. Maps to `vfs.FileDescriptorImpl.Release` in the
+ sentry.
+- `FUSE_BMAP`: Old address space API for block defrag. Probably not needed.
+- `FUSE_NOTIFY_REPLY`: [TODO: what does this do?]
+
+# References
+
+- [fuse(4) Linux manual page](https://www.man7.org/linux/man-pages/man4/fuse.4.html)
+- [Linux kernel FUSE documentation](https://www.kernel.org/doc/html/latest/filesystems/fuse.html)
+- [The reference implementation of the Linux FUSE (Filesystem in Userspace)
+ interface](https://github.com/libfuse/libfuse)
+- [The kernel interface of FUSE](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fuse.h)
diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go
index b2fcab127..c0bc63a32 100644
--- a/pkg/sentry/fs/gofer/file.go
+++ b/pkg/sentry/fs/gofer/file.go
@@ -114,7 +114,7 @@ func NewFile(ctx context.Context, dirent *fs.Dirent, name string, flags fs.FileF
}
// Release implements fs.FileOperations.Release.
-func (f *fileOperations) Release() {
+func (f *fileOperations) Release(context.Context) {
f.handles.DecRef()
}
@@ -122,7 +122,7 @@ func (f *fileOperations) Release() {
func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) {
root := fs.RootFromContext(ctx)
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(ctx)
}
dirCtx := &fs.DirCtx{
diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go
index 2df2fe889..326fed954 100644
--- a/pkg/sentry/fs/gofer/gofer_test.go
+++ b/pkg/sentry/fs/gofer/gofer_test.go
@@ -232,7 +232,7 @@ func TestRevalidation(t *testing.T) {
// We must release the dirent, or the test will fail
// with a reference leak. This is tracked by p9test.
- defer dirent.DecRef()
+ defer dirent.DecRef(ctx)
// Walk again. Depending on the cache policy, we may
// get a new dirent.
@@ -246,7 +246,7 @@ func TestRevalidation(t *testing.T) {
if !test.preModificationWantReload && dirent != newDirent {
t.Errorf("Lookup with cachePolicy=%s got new dirent %+v, wanted old dirent %+v", test.cachePolicy, newDirent, dirent)
}
- newDirent.DecRef() // See above.
+ newDirent.DecRef(ctx) // See above.
// Modify the underlying mocked file's modification
// time for the next walk that occurs.
@@ -287,7 +287,7 @@ func TestRevalidation(t *testing.T) {
if test.postModificationWantUpdatedAttrs && gotModTimeSeconds != nowSeconds {
t.Fatalf("Lookup with cachePolicy=%s got new modification time %v, wanted %v", test.cachePolicy, gotModTimeSeconds, nowSeconds)
}
- newDirent.DecRef() // See above.
+ newDirent.DecRef(ctx) // See above.
// Remove the file from the remote fs, subsequent walks
// should now fail to find anything.
@@ -303,7 +303,7 @@ func TestRevalidation(t *testing.T) {
t.Errorf("Lookup with cachePolicy=%s got new dirent and error %v, wanted old dirent and nil error", test.cachePolicy, err)
}
if err == nil {
- newDirent.DecRef() // See above.
+ newDirent.DecRef(ctx) // See above.
}
})
}
diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go
index fc14249be..f324dbf26 100644
--- a/pkg/sentry/fs/gofer/handles.go
+++ b/pkg/sentry/fs/gofer/handles.go
@@ -47,7 +47,8 @@ type handles struct {
// DecRef drops a reference on handles.
func (h *handles) DecRef() {
- h.DecRefWithDestructor(func() {
+ ctx := context.Background()
+ h.DecRefWithDestructor(ctx, func(context.Context) {
if h.Host != nil {
if h.isHostBorrowed {
h.Host.Release()
@@ -57,7 +58,7 @@ func (h *handles) DecRef() {
}
}
}
- if err := h.File.close(context.Background()); err != nil {
+ if err := h.File.close(ctx); err != nil {
log.Warningf("error closing p9 file: %v", err)
}
})
diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go
index a016c896e..3a225fd39 100644
--- a/pkg/sentry/fs/gofer/inode.go
+++ b/pkg/sentry/fs/gofer/inode.go
@@ -441,8 +441,9 @@ func (i *inodeOperations) Release(ctx context.Context) {
// asynchronously.
//
// We use AsyncWithContext to avoid needing to allocate an extra
- // anonymous function on the heap.
- fs.AsyncWithContext(ctx, i.fileState.Release)
+ // anonymous function on the heap. We must use background context
+ // because the async work cannot happen on the task context.
+ fs.AsyncWithContext(context.Background(), i.fileState.Release)
}
// Mappable implements fs.InodeOperations.Mappable.
@@ -640,7 +641,7 @@ func (i *inodeOperations) Allocate(ctx context.Context, inode *fs.Inode, offset,
// WriteOut implements fs.InodeOperations.WriteOut.
func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error {
- if !i.session().cachePolicy.cacheUAttrs(inode) {
+ if inode.MountSource.Flags.ReadOnly || !i.session().cachePolicy.cacheUAttrs(inode) {
return nil
}
diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go
index cf9800100..3c66dc3c2 100644
--- a/pkg/sentry/fs/gofer/path.go
+++ b/pkg/sentry/fs/gofer/path.go
@@ -168,7 +168,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string
// Construct the positive Dirent.
d := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name)
- defer d.DecRef()
+ defer d.DecRef(ctx)
// Construct the new file, caching the handles if allowed.
h := handles{
@@ -371,7 +371,7 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string
// Find out if the file being deleted is a socket or pipe that needs to be
// removed from the endpoint map.
if d, err := i.Lookup(ctx, dir, name); err == nil {
- defer d.DecRef()
+ defer d.DecRef(ctx)
if fs.IsSocket(d.Inode.StableAttr) || fs.IsPipe(d.Inode.StableAttr) {
switch iops := d.Inode.InodeOperations.(type) {
@@ -392,7 +392,7 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string
return err
}
if key != nil {
- i.session().overrides.remove(*key)
+ i.session().overrides.remove(ctx, *key)
}
i.touchModificationAndStatusChangeTime(ctx, dir)
diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go
index b5efc86f2..7cf3522ff 100644
--- a/pkg/sentry/fs/gofer/session.go
+++ b/pkg/sentry/fs/gofer/session.go
@@ -89,10 +89,10 @@ func (e *overrideMaps) addPipe(key device.MultiDeviceKey, d *fs.Dirent, inode *f
// remove deletes the key from the maps.
//
// Precondition: maps must have been locked with 'lock'.
-func (e *overrideMaps) remove(key device.MultiDeviceKey) {
+func (e *overrideMaps) remove(ctx context.Context, key device.MultiDeviceKey) {
endpoint := e.keyMap[key]
delete(e.keyMap, key)
- endpoint.dirent.DecRef()
+ endpoint.dirent.DecRef(ctx)
}
// lock blocks other addition and removal operations from happening while
@@ -197,7 +197,7 @@ type session struct {
}
// Destroy tears down the session.
-func (s *session) Destroy() {
+func (s *session) Destroy(ctx context.Context) {
s.client.Close()
}
@@ -329,7 +329,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF
s.client, err = p9.NewClient(conn, s.msize, s.version)
if err != nil {
// Drop our reference on the session, it needs to be torn down.
- s.DecRef()
+ s.DecRef(ctx)
return nil, err
}
@@ -340,7 +340,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF
ctx.UninterruptibleSleepFinish(false)
if err != nil {
// Same as above.
- s.DecRef()
+ s.DecRef(ctx)
return nil, err
}
@@ -348,7 +348,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF
if err != nil {
s.attach.close(ctx)
// Same as above, but after we execute the Close request.
- s.DecRef()
+ s.DecRef(ctx)
return nil, err
}
@@ -393,13 +393,13 @@ func (s *session) fillKeyMap(ctx context.Context) error {
// fillPathMap populates paths for overrides from dirents in direntMap
// before save.
-func (s *session) fillPathMap() error {
+func (s *session) fillPathMap(ctx context.Context) error {
unlock := s.overrides.lock()
defer unlock()
for _, endpoint := range s.overrides.keyMap {
mountRoot := endpoint.dirent.MountRoot()
- defer mountRoot.DecRef()
+ defer mountRoot.DecRef(ctx)
dirPath, _ := endpoint.dirent.FullName(mountRoot)
if dirPath == "" {
return fmt.Errorf("error getting path from dirent")
diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go
index 2d398b753..48b423dd8 100644
--- a/pkg/sentry/fs/gofer/session_state.go
+++ b/pkg/sentry/fs/gofer/session_state.go
@@ -26,7 +26,8 @@ import (
// beforeSave is invoked by stateify.
func (s *session) beforeSave() {
if s.overrides != nil {
- if err := s.fillPathMap(); err != nil {
+ ctx := &dummyClockContext{context.Background()}
+ if err := s.fillPathMap(ctx); err != nil {
panic("failed to save paths to override map before saving" + err.Error())
}
}
diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go
index 40f2c1cad..8a1c69ac2 100644
--- a/pkg/sentry/fs/gofer/socket.go
+++ b/pkg/sentry/fs/gofer/socket.go
@@ -134,14 +134,14 @@ func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.Connect
// We don't need the receiver.
c.CloseRecv()
- c.Release()
+ c.Release(ctx)
return c, nil
}
// Release implements transport.BoundEndpoint.Release.
-func (e *endpoint) Release() {
- e.inode.DecRef()
+func (e *endpoint) Release(ctx context.Context) {
+ e.inode.DecRef(ctx)
}
// Passcred implements transport.BoundEndpoint.Passcred.
diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD
index aabce6cc9..d41d23a43 100644
--- a/pkg/sentry/fs/host/BUILD
+++ b/pkg/sentry/fs/host/BUILD
@@ -30,6 +30,7 @@ go_library(
"//pkg/context",
"//pkg/fd",
"//pkg/fdnotifier",
+ "//pkg/iovec",
"//pkg/log",
"//pkg/refs",
"//pkg/safemem",
diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go
index 39299b7e4..0d8d36afa 100644
--- a/pkg/sentry/fs/host/control.go
+++ b/pkg/sentry/fs/host/control.go
@@ -57,7 +57,7 @@ func (c *scmRights) Clone() transport.RightsControlMessage {
}
// Release implements transport.RightsControlMessage.Release.
-func (c *scmRights) Release() {
+func (c *scmRights) Release(ctx context.Context) {
for _, fd := range c.fds {
syscall.Close(fd)
}
diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go
index 3e48b8b2c..86d1a87f0 100644
--- a/pkg/sentry/fs/host/file.go
+++ b/pkg/sentry/fs/host/file.go
@@ -110,7 +110,7 @@ func newFileFromDonatedFD(ctx context.Context, donated int, saveable, isTTY bool
name := fmt.Sprintf("host:[%d]", inode.StableAttr.InodeID)
dirent := fs.NewDirent(ctx, inode, name)
- defer dirent.DecRef()
+ defer dirent.DecRef(ctx)
if isTTY {
return newTTYFile(ctx, dirent, flags, iops), nil
@@ -169,7 +169,7 @@ func (f *fileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) {
root := fs.RootFromContext(ctx)
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(ctx)
}
dirCtx := &fs.DirCtx{
Serializer: serializer,
diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go
index 62f1246aa..fbfba1b58 100644
--- a/pkg/sentry/fs/host/inode.go
+++ b/pkg/sentry/fs/host/inode.go
@@ -368,6 +368,9 @@ func (i *inodeOperations) Allocate(ctx context.Context, inode *fs.Inode, offset,
// WriteOut implements fs.InodeOperations.WriteOut.
func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error {
+ if inode.MountSource.Flags.ReadOnly {
+ return nil
+ }
// Have we been using host kernel metadata caches?
if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) {
// Then the metadata is already up to date on the host.
diff --git a/pkg/sentry/fs/host/inode_test.go b/pkg/sentry/fs/host/inode_test.go
index c507f57eb..41a23b5da 100644
--- a/pkg/sentry/fs/host/inode_test.go
+++ b/pkg/sentry/fs/host/inode_test.go
@@ -36,7 +36,7 @@ func TestCloseFD(t *testing.T) {
if err != nil {
t.Fatalf("Failed to create File: %v", err)
}
- file.DecRef()
+ file.DecRef(ctx)
s := make([]byte, 10)
if c, err := syscall.Read(p[0], s); c != 0 || err != nil {
diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go
index b6e94583e..a2f3d5918 100644
--- a/pkg/sentry/fs/host/socket.go
+++ b/pkg/sentry/fs/host/socket.go
@@ -22,7 +22,6 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/fdnotifier"
- "gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/socket/control"
@@ -39,11 +38,6 @@ import (
// LINT.IfChange
-// maxSendBufferSize is the maximum host send buffer size allowed for endpoint.
-//
-// N.B. 8MB is the default maximum on Linux (2 * sysctl_wmem_max).
-const maxSendBufferSize = 8 << 20
-
// ConnectedEndpoint is a host FD backed implementation of
// transport.ConnectedEndpoint and transport.Receiver.
//
@@ -103,10 +97,6 @@ func (c *ConnectedEndpoint) init() *syserr.Error {
if err != nil {
return syserr.FromError(err)
}
- if sndbuf > maxSendBufferSize {
- log.Warningf("Socket send buffer too large: %d", sndbuf)
- return syserr.ErrInvalidEndpointState
- }
c.stype = linux.SockType(stype)
c.sndbuf = int64(sndbuf)
@@ -204,7 +194,7 @@ func newSocket(ctx context.Context, orgfd int, saveable bool) (*fs.File, error)
}
// Send implements transport.ConnectedEndpoint.Send.
-func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) {
+func (c *ConnectedEndpoint) Send(ctx context.Context, data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) {
c.mu.RLock()
defer c.mu.RUnlock()
@@ -281,7 +271,7 @@ func (c *ConnectedEndpoint) EventUpdate() {
}
// Recv implements transport.Receiver.Recv.
-func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
+func (c *ConnectedEndpoint) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
c.mu.RLock()
defer c.mu.RUnlock()
@@ -328,7 +318,7 @@ func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights int, peek
}
// close releases all resources related to the endpoint.
-func (c *ConnectedEndpoint) close() {
+func (c *ConnectedEndpoint) close(context.Context) {
fdnotifier.RemoveFD(int32(c.file.FD()))
c.file.Close()
c.file = nil
@@ -384,8 +374,8 @@ func (c *ConnectedEndpoint) RecvMaxQueueSize() int64 {
}
// Release implements transport.ConnectedEndpoint.Release and transport.Receiver.Release.
-func (c *ConnectedEndpoint) Release() {
- c.ref.DecRefWithDestructor(c.close)
+func (c *ConnectedEndpoint) Release(ctx context.Context) {
+ c.ref.DecRefWithDestructor(ctx, c.close)
}
// CloseUnread implements transport.ConnectedEndpoint.CloseUnread.
diff --git a/pkg/sentry/fs/host/socket_iovec.go b/pkg/sentry/fs/host/socket_iovec.go
index 5c18dbd5e..905afb50d 100644
--- a/pkg/sentry/fs/host/socket_iovec.go
+++ b/pkg/sentry/fs/host/socket_iovec.go
@@ -17,15 +17,12 @@ package host
import (
"syscall"
- "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/iovec"
"gvisor.dev/gvisor/pkg/syserror"
)
// LINT.IfChange
-// maxIovs is the maximum number of iovecs to pass to the host.
-var maxIovs = linux.UIO_MAXIOV
-
// copyToMulti copies as many bytes from src to dst as possible.
func copyToMulti(dst [][]byte, src []byte) {
for _, d := range dst {
@@ -76,7 +73,7 @@ func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovec
}
}
- if iovsRequired > maxIovs {
+ if iovsRequired > iovec.MaxIovs {
// The kernel will reject our call if we pass this many iovs.
// Use a single intermediate buffer instead.
b := make([]byte, stopLen)
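
Editor's note: iovec.MaxIovs replaces the package-local maxIovs here. The fallback it guards can be sketched on its own (assuming, as on Linux, a UIO_MAXIOV-style limit of 1024; the helper below is hypothetical, not the real buildIovec):

    package main

    import "fmt"

    const maxIovs = 1024 // assumed value of iovec.MaxIovs (Linux UIO_MAXIOV)

    // flattenIfNeeded returns a single intermediate buffer when bufs
    // would need more iovecs than the host kernel accepts, and nil when
    // bufs can be passed through directly as iovecs.
    func flattenIfNeeded(bufs [][]byte) []byte {
        if len(bufs) <= maxIovs {
            return nil
        }
        var total int
        for _, b := range bufs {
            total += len(b)
        }
        out := make([]byte, 0, total)
        for _, b := range bufs {
            out = append(out, b...)
        }
        return out
    }

    func main() {
        fmt.Println(flattenIfNeeded(make([][]byte, 2)) == nil) // true: passthrough
    }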
diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go
index affdbcacb..9d58ea448 100644
--- a/pkg/sentry/fs/host/socket_test.go
+++ b/pkg/sentry/fs/host/socket_test.go
@@ -67,11 +67,12 @@ func TestSocketIsBlocking(t *testing.T) {
if fl&syscall.O_NONBLOCK == syscall.O_NONBLOCK {
t.Fatalf("Expected socket %v to be blocking", pair[1])
}
- sock, err := newSocket(contexttest.Context(t), pair[0], false)
+ ctx := contexttest.Context(t)
+ sock, err := newSocket(ctx, pair[0], false)
if err != nil {
t.Fatalf("newSocket(%v) failed => %v", pair[0], err)
}
- defer sock.DecRef()
+ defer sock.DecRef(ctx)
// Test that the socket now is non-blocking.
if fl, err = getFl(pair[0]); err != nil {
t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[0], err)
@@ -93,11 +94,12 @@ func TestSocketWritev(t *testing.T) {
if err != nil {
t.Fatalf("host socket creation failed: %v", err)
}
- socket, err := newSocket(contexttest.Context(t), pair[0], false)
+ ctx := contexttest.Context(t)
+ socket, err := newSocket(ctx, pair[0], false)
if err != nil {
t.Fatalf("newSocket(%v) => %v", pair[0], err)
}
- defer socket.DecRef()
+ defer socket.DecRef(ctx)
buf := []byte("hello world\n")
n, err := socket.Writev(contexttest.Context(t), usermem.BytesIOSequence(buf))
if err != nil {
@@ -115,11 +117,12 @@ func TestSocketWritevLen0(t *testing.T) {
if err != nil {
t.Fatalf("host socket creation failed: %v", err)
}
- socket, err := newSocket(contexttest.Context(t), pair[0], false)
+ ctx := contexttest.Context(t)
+ socket, err := newSocket(ctx, pair[0], false)
if err != nil {
t.Fatalf("newSocket(%v) => %v", pair[0], err)
}
- defer socket.DecRef()
+ defer socket.DecRef(ctx)
n, err := socket.Writev(contexttest.Context(t), usermem.BytesIOSequence(nil))
if err != nil {
t.Fatalf("socket writev failed: %v", err)
@@ -136,11 +139,12 @@ func TestSocketSendMsgLen0(t *testing.T) {
if err != nil {
t.Fatalf("host socket creation failed: %v", err)
}
- sfile, err := newSocket(contexttest.Context(t), pair[0], false)
+ ctx := contexttest.Context(t)
+ sfile, err := newSocket(ctx, pair[0], false)
if err != nil {
t.Fatalf("newSocket(%v) => %v", pair[0], err)
}
- defer sfile.DecRef()
+ defer sfile.DecRef(ctx)
s := sfile.FileOperations.(socket.Socket)
n, terr := s.SendMsg(nil, usermem.BytesIOSequence(nil), []byte{}, 0, false, ktime.Time{}, socket.ControlMessages{})
@@ -158,18 +162,19 @@ func TestListen(t *testing.T) {
if err != nil {
t.Fatalf("syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) => %v", err)
}
- sfile1, err := newSocket(contexttest.Context(t), pair[0], false)
+ ctx := contexttest.Context(t)
+ sfile1, err := newSocket(ctx, pair[0], false)
if err != nil {
t.Fatalf("newSocket(%v) => %v", pair[0], err)
}
- defer sfile1.DecRef()
+ defer sfile1.DecRef(ctx)
socket1 := sfile1.FileOperations.(socket.Socket)
- sfile2, err := newSocket(contexttest.Context(t), pair[1], false)
+ sfile2, err := newSocket(ctx, pair[1], false)
if err != nil {
t.Fatalf("newSocket(%v) => %v", pair[1], err)
}
- defer sfile2.DecRef()
+ defer sfile2.DecRef(ctx)
socket2 := sfile2.FileOperations.(socket.Socket)
// Socketpairs cannot be listened on.
@@ -185,11 +190,11 @@ func TestListen(t *testing.T) {
if err != nil {
t.Fatalf("syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) => %v", err)
}
- sfile3, err := newSocket(contexttest.Context(t), sock, false)
+ sfile3, err := newSocket(ctx, sock, false)
if err != nil {
t.Fatalf("newSocket(%v) => %v", sock, err)
}
- defer sfile3.DecRef()
+ defer sfile3.DecRef(ctx)
socket3 := sfile3.FileOperations.(socket.Socket)
// This socket is not bound so we can't listen on it.
@@ -237,9 +242,10 @@ func TestRelease(t *testing.T) {
}
c := &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)}
want := &ConnectedEndpoint{queue: c.queue}
- want.ref.DecRef()
+ ctx := contexttest.Context(t)
+ want.ref.DecRef(ctx)
fdnotifier.AddFD(int32(c.file.FD()), nil)
- c.Release()
+ c.Release(ctx)
if !reflect.DeepEqual(c, want) {
t.Errorf("got = %#v, want = %#v", c, want)
}
diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go
index cb91355ab..b5229098c 100644
--- a/pkg/sentry/fs/host/tty.go
+++ b/pkg/sentry/fs/host/tty.go
@@ -113,12 +113,12 @@ func (t *TTYFileOperations) Write(ctx context.Context, file *fs.File, src userme
}
// Release implements fs.FileOperations.Release.
-func (t *TTYFileOperations) Release() {
+func (t *TTYFileOperations) Release(ctx context.Context) {
t.mu.Lock()
t.fgProcessGroup = nil
t.mu.Unlock()
- t.fileOperations.Release()
+ t.fileOperations.Release(ctx)
}
// Ioctl implements fs.FileOperations.Ioctl.
@@ -308,9 +308,9 @@ func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) e
task := kernel.TaskFromContext(ctx)
if task == nil {
// No task? Linux does not have an analog for this case, but
- // tty_check_change is more of a blacklist of cases than a
- // whitelist, and is surprisingly permissive. Allowing the
- // change seems most appropriate.
+ // tty_check_change only blocks specific cases and is
+ // surprisingly permissive. Allowing the change seems
+ // appropriate.
return nil
}
diff --git a/pkg/sentry/fs/host/wait_test.go b/pkg/sentry/fs/host/wait_test.go
index ce397a5e3..c143f4ce2 100644
--- a/pkg/sentry/fs/host/wait_test.go
+++ b/pkg/sentry/fs/host/wait_test.go
@@ -39,7 +39,7 @@ func TestWait(t *testing.T) {
t.Fatalf("NewFile failed: %v", err)
}
- defer file.DecRef()
+ defer file.DecRef(ctx)
r := file.Readiness(waiter.EventIn)
if r != 0 {
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index a34fbc946..b79cd9877 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -96,13 +96,12 @@ func NewInode(ctx context.Context, iops InodeOperations, msrc *MountSource, satt
}
// DecRef drops a reference on the Inode.
-func (i *Inode) DecRef() {
- i.DecRefWithDestructor(i.destroy)
+func (i *Inode) DecRef(ctx context.Context) {
+ i.DecRefWithDestructor(ctx, i.destroy)
}
// destroy releases the Inode and drops the reference taken on msrc.
-func (i *Inode) destroy() {
- ctx := context.Background()
+func (i *Inode) destroy(ctx context.Context) {
if err := i.WriteOut(ctx); err != nil {
// FIXME(b/65209558): Mark as warning again once noatime is
// properly supported.
@@ -122,12 +121,12 @@ func (i *Inode) destroy() {
i.Watches.targetDestroyed()
if i.overlay != nil {
- i.overlay.release()
+ i.overlay.release(ctx)
} else {
i.InodeOperations.Release(ctx)
}
- i.MountSource.DecRef()
+ i.MountSource.DecRef(ctx)
}
// Mappable calls i.InodeOperations.Mappable.
diff --git a/pkg/sentry/fs/inode_inotify.go b/pkg/sentry/fs/inode_inotify.go
index efd3c962b..9911a00c2 100644
--- a/pkg/sentry/fs/inode_inotify.go
+++ b/pkg/sentry/fs/inode_inotify.go
@@ -17,6 +17,7 @@ package fs
import (
"fmt"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sync"
)
@@ -136,11 +137,11 @@ func (w *Watches) Notify(name string, events, cookie uint32) {
}
// Unpin unpins dirent from all watches in this set.
-func (w *Watches) Unpin(d *Dirent) {
+func (w *Watches) Unpin(ctx context.Context, d *Dirent) {
w.mu.RLock()
defer w.mu.RUnlock()
for _, watch := range w.ws {
- watch.Unpin(d)
+ watch.Unpin(ctx, d)
}
}
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index 537c8d257..dc2e353d9 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -85,7 +85,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name
upperInode = child.Inode
upperInode.IncRef()
}
- child.DecRef()
+ child.DecRef(ctx)
}
// Are we done?
@@ -108,7 +108,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name
entry, err := newOverlayEntry(ctx, upperInode, nil, false)
if err != nil {
// Don't leak resources.
- upperInode.DecRef()
+ upperInode.DecRef(ctx)
parent.copyMu.RUnlock()
return nil, false, err
}
@@ -129,7 +129,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name
if err != nil && err != syserror.ENOENT {
// Don't leak resources.
if upperInode != nil {
- upperInode.DecRef()
+ upperInode.DecRef(ctx)
}
parent.copyMu.RUnlock()
return nil, false, err
@@ -152,7 +152,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name
}
}
}
- child.DecRef()
+ child.DecRef(ctx)
}
}
@@ -183,7 +183,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name
// unnecessary because we don't need to copy-up and we will always
// operate (e.g. read/write) on the upper Inode.
if !IsDir(upperInode.StableAttr) {
- lowerInode.DecRef()
+ lowerInode.DecRef(ctx)
lowerInode = nil
}
}
@@ -194,10 +194,10 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name
// Well, not quite, we failed at the last moment, how depressing.
// Be sure not to leak resources.
if upperInode != nil {
- upperInode.DecRef()
+ upperInode.DecRef(ctx)
}
if lowerInode != nil {
- lowerInode.DecRef()
+ lowerInode.DecRef(ctx)
}
parent.copyMu.RUnlock()
return nil, false, err
@@ -248,7 +248,7 @@ func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name st
// user) will clobber the real path for the underlying Inode.
upperFile.Dirent.Inode.IncRef()
upperDirent := NewTransientDirent(upperFile.Dirent.Inode)
- upperFile.Dirent.DecRef()
+ upperFile.Dirent.DecRef(ctx)
upperFile.Dirent = upperDirent
// Create the overlay inode and dirent. We need this to construct the
@@ -259,7 +259,7 @@ func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name st
// The overlay file created below with NewFile will take a reference on
// the overlayDirent, and it should be the only thing holding a
// reference at the time of creation, so we must drop this reference.
- defer overlayDirent.DecRef()
+ defer overlayDirent.DecRef(ctx)
// Create a new overlay file that wraps the upper file.
flags.Pread = upperFile.Flags().Pread
@@ -399,7 +399,7 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena
if !replaced.IsNegative() && IsDir(replaced.Inode.StableAttr) {
children, err := readdirOne(ctx, replaced)
if err != nil {
- replaced.DecRef()
+ replaced.DecRef(ctx)
return err
}
@@ -407,12 +407,12 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena
// included among the returned children, so we don't
// need to bother checking for them.
if len(children) > 0 {
- replaced.DecRef()
+ replaced.DecRef(ctx)
return syserror.ENOTEMPTY
}
}
- replaced.DecRef()
+ replaced.DecRef(ctx)
}
}
@@ -455,12 +455,12 @@ func overlayBind(ctx context.Context, o *overlayEntry, parent *Dirent, name stri
// Grab the inode and drop the dirent, we don't need it.
inode := d.Inode
inode.IncRef()
- d.DecRef()
+ d.DecRef(ctx)
// Create a new overlay entry and dirent for the socket.
entry, err := newOverlayEntry(ctx, inode, nil, false)
if err != nil {
- inode.DecRef()
+ inode.DecRef(ctx)
return nil, err
}
// Use the parent's MountSource, since that corresponds to the overlay,
@@ -672,7 +672,7 @@ func overlayGetlink(ctx context.Context, o *overlayEntry) (*Dirent, error) {
// ground and claim that jumping around the filesystem like this
// is not supported.
name, _ := dirent.FullName(nil)
- dirent.DecRef()
+ dirent.DecRef(ctx)
// Claim that the path is not accessible.
err = syserror.EACCES
diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go
index 389c219d6..aa9851b26 100644
--- a/pkg/sentry/fs/inode_overlay_test.go
+++ b/pkg/sentry/fs/inode_overlay_test.go
@@ -316,7 +316,7 @@ func TestCacheFlush(t *testing.T) {
t.Fatalf("NewMountNamespace failed: %v", err)
}
root := mns.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
ctx = &rootContext{
Context: ctx,
@@ -345,7 +345,7 @@ func TestCacheFlush(t *testing.T) {
}
// Drop the file reference.
- file.DecRef()
+ file.DecRef(ctx)
// Dirent should have 2 refs left.
if got, want := dirent.ReadRefs(), 2; int(got) != want {
@@ -361,7 +361,7 @@ func TestCacheFlush(t *testing.T) {
}
// Drop our ref.
- dirent.DecRef()
+ dirent.DecRef(ctx)
// We should be back to zero refs.
if got, want := dirent.ReadRefs(), 0; int(got) != want {
@@ -398,7 +398,7 @@ func (d *dir) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags
if err != nil {
return nil, err
}
- defer file.DecRef()
+ defer file.DecRef(ctx)
// Wrap the file's FileOperations in a dirFile.
fops := &dirFile{
FileOperations: file.FileOperations,
diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go
index e3a715c1f..c5c07d564 100644
--- a/pkg/sentry/fs/inotify.go
+++ b/pkg/sentry/fs/inotify.go
@@ -80,7 +80,7 @@ func NewInotify(ctx context.Context) *Inotify {
// Release implements FileOperations.Release. Release removes all watches and
// frees all resources for an inotify instance.
-func (i *Inotify) Release() {
+func (i *Inotify) Release(ctx context.Context) {
// We need to hold i.mu to avoid a race with concurrent calls to
// Inotify.targetDestroyed from Watches. There's no risk of Watches
// accessing this Inotify after the destructor ends, because we remove all
@@ -93,7 +93,7 @@ func (i *Inotify) Release() {
// the owner's destructor.
w.target.Watches.Remove(w.ID())
// Don't leak any references to the target, held by pins in the watch.
- w.destroy()
+ w.destroy(ctx)
}
}
@@ -321,7 +321,7 @@ func (i *Inotify) AddWatch(target *Dirent, mask uint32) int32 {
//
// RmWatch looks up an inotify watch for the given 'wd' and configures the
// target dirent to stop sending events to this inotify instance.
-func (i *Inotify) RmWatch(wd int32) error {
+func (i *Inotify) RmWatch(ctx context.Context, wd int32) error {
i.mu.Lock()
// Find the watch we were asked to remove.
@@ -346,7 +346,7 @@ func (i *Inotify) RmWatch(wd int32) error {
i.queueEvent(newEvent(watch.wd, "", linux.IN_IGNORED, 0))
// Remove all pins.
- watch.destroy()
+ watch.destroy(ctx)
return nil
}
diff --git a/pkg/sentry/fs/inotify_watch.go b/pkg/sentry/fs/inotify_watch.go
index 900cba3ca..605423d22 100644
--- a/pkg/sentry/fs/inotify_watch.go
+++ b/pkg/sentry/fs/inotify_watch.go
@@ -18,6 +18,7 @@ import (
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sync"
)
@@ -105,12 +106,12 @@ func (w *Watch) Pin(d *Dirent) {
// Unpin drops any extra refs held on dirent due to a previous Pin
// call. Calling Unpin multiple times for the same dirent, or on a dirent
// without a corresponding Pin call, is a no-op.
-func (w *Watch) Unpin(d *Dirent) {
+func (w *Watch) Unpin(ctx context.Context, d *Dirent) {
w.mu.Lock()
defer w.mu.Unlock()
if w.pins[d] {
delete(w.pins, d)
- d.DecRef()
+ d.DecRef(ctx)
}
}
@@ -125,11 +126,11 @@ func (w *Watch) TargetDestroyed() {
// this watch. Destroy does not cause any new events to be generated. The caller
// is responsible for ensuring there are no outstanding references to this
// watch.
-func (w *Watch) destroy() {
+func (w *Watch) destroy(ctx context.Context) {
w.mu.Lock()
defer w.mu.Unlock()
for d := range w.pins {
- d.DecRef()
+ d.DecRef(ctx)
}
w.pins = nil
}
diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go
index 926538d90..8a5d9c7eb 100644
--- a/pkg/sentry/fs/lock/lock.go
+++ b/pkg/sentry/fs/lock/lock.go
@@ -62,7 +62,7 @@ import (
type LockType int
// UniqueID is a unique identifier of the holder of a regional file lock.
-type UniqueID uint64
+type UniqueID interface{}
const (
// ReadLock describes a POSIX regional file lock to be taken
@@ -98,12 +98,7 @@ type Lock struct {
// If len(Readers) > 0 then Writer must be nil.
Readers map[UniqueID]bool
- // HasWriter indicates that this is a write lock held by a single
- // UniqueID.
- HasWriter bool
-
- // Writer is only valid if HasWriter is true. It identifies a
- // single write lock holder.
+ // Writer holds the writer unique ID. It's nil if there are no writers.
Writer UniqueID
}
@@ -186,7 +181,6 @@ func makeLock(uid UniqueID, t LockType) Lock {
case ReadLock:
value.Readers[uid] = true
case WriteLock:
- value.HasWriter = true
value.Writer = uid
default:
panic(fmt.Sprintf("makeLock: invalid lock type %d", t))
@@ -196,10 +190,7 @@ func makeLock(uid UniqueID, t LockType) Lock {
// isHeld returns true if uid is a holder of Lock.
func (l Lock) isHeld(uid UniqueID) bool {
- if l.HasWriter && l.Writer == uid {
- return true
- }
- return l.Readers[uid]
+ return l.Writer == uid || l.Readers[uid]
}
// lock sets uid as a holder of a typed lock on Lock.
@@ -214,20 +205,20 @@ func (l *Lock) lock(uid UniqueID, t LockType) {
}
// We cannot downgrade a write lock to a read lock unless the
// uid is the same.
- if l.HasWriter {
+ if l.Writer != nil {
if l.Writer != uid {
panic(fmt.Sprintf("lock: cannot downgrade write lock to read lock for uid %d, writer is %d", uid, l.Writer))
}
// Ensure that there is only one reader if upgrading.
l.Readers = make(map[UniqueID]bool)
// Ensure that there is no longer a writer.
- l.HasWriter = false
+ l.Writer = nil
}
l.Readers[uid] = true
return
case WriteLock:
// If we are already the writer, then this is a no-op.
- if l.HasWriter && l.Writer == uid {
+ if l.Writer == uid {
return
}
// We can only upgrade a read lock to a write lock if there
@@ -243,7 +234,6 @@ func (l *Lock) lock(uid UniqueID, t LockType) {
}
// Ensure that there is only a writer.
l.Readers = make(map[UniqueID]bool)
- l.HasWriter = true
l.Writer = uid
default:
panic(fmt.Sprintf("lock: invalid lock type %d", t))
@@ -277,9 +267,8 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool {
switch t {
case ReadLock:
return l.lockable(r, func(value Lock) bool {
- // If there is no writer, there's no problem adding
- // another reader.
- if !value.HasWriter {
+ // If there is no writer, there's no problem adding another reader.
+ if value.Writer == nil {
return true
}
// If there is a writer, then it must be the same uid
@@ -289,10 +278,9 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool {
case WriteLock:
return l.lockable(r, func(value Lock) bool {
// If there are only readers.
- if !value.HasWriter {
- // Then this uid can only take a write lock if
- // this is a private upgrade, meaning that the
- // only reader is uid.
+ if value.Writer == nil {
+ // Then this uid can only take a write lock if this is a private
+ // upgrade, meaning that the only reader is uid.
return len(value.Readers) == 1 && value.Readers[uid]
}
// If the uid is already a writer on this region, then
@@ -304,7 +292,8 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool {
}
}
-// lock returns true if uid took a lock of type t on the entire range of LockRange.
+// lock returns true if uid took a lock of type t on the entire range of
+// LockRange.
//
// Preconditions: r.Start <= r.End (will panic otherwise).
func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool {
@@ -339,7 +328,7 @@ func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool {
seg, _ = l.SplitUnchecked(seg, r.End)
}
- // Set the lock on the segment. This is guaranteed to
+ // Set the lock on the segment. This is guaranteed to
// always be safe, given canLock above.
value := seg.ValuePtr()
value.lock(uid, t)
@@ -386,7 +375,7 @@ func (l *LockSet) unlock(uid UniqueID, r LockRange) {
value := seg.Value()
var remove bool
- if value.HasWriter && value.Writer == uid {
+ if value.Writer == uid {
// If we are unlocking a writer, then since there can
// only ever be one writer and no readers, then this
// lock should always be removed from the set.
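
Editor's note: making UniqueID an interface and folding HasWriter into a nil-able Writer relies on Go interface comparison — an interface value is nil only when no writer is set, and Writer == uid compares dynamic type and value, so any comparable ID type (uint64, a pointer, ...) works. A self-contained sketch of the invariant (types renamed to avoid claiming the real API):

    package main

    import "fmt"

    type UniqueID interface{}

    // lockValue mirrors the shape of Lock after this change: Writer is
    // nil when the region has no writer, replacing the old HasWriter bool.
    type lockValue struct {
        Readers map[UniqueID]bool
        Writer  UniqueID
    }

    func (l lockValue) isHeld(uid UniqueID) bool {
        return l.Writer == uid || l.Readers[uid]
    }

    func main() {
        l := lockValue{Readers: map[UniqueID]bool{}, Writer: uint64(1)}
        fmt.Println(l.isHeld(uint64(1))) // true: same dynamic type and value
        fmt.Println(l.isHeld(int(1)))    // false: int(1) != uint64(1)
        fmt.Println(l.Writer == nil)     // false until the writer unlocks
    }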
diff --git a/pkg/sentry/fs/lock/lock_set_functions.go b/pkg/sentry/fs/lock/lock_set_functions.go
index 8a3ace0c1..50a16e662 100644
--- a/pkg/sentry/fs/lock/lock_set_functions.go
+++ b/pkg/sentry/fs/lock/lock_set_functions.go
@@ -44,14 +44,9 @@ func (lockSetFunctions) Merge(r1 LockRange, val1 Lock, r2 LockRange, val2 Lock)
return Lock{}, false
}
}
- if val1.HasWriter != val2.HasWriter {
+ if val1.Writer != val2.Writer {
return Lock{}, false
}
- if val1.HasWriter {
- if val1.Writer != val2.Writer {
- return Lock{}, false
- }
- }
return val1, true
}
@@ -62,7 +57,6 @@ func (lockSetFunctions) Split(r LockRange, val Lock, split uint64) (Lock, Lock)
for k, v := range val.Readers {
val0.Readers[k] = v
}
- val0.HasWriter = val.HasWriter
val0.Writer = val.Writer
return val, val0
diff --git a/pkg/sentry/fs/lock/lock_test.go b/pkg/sentry/fs/lock/lock_test.go
index ba002aeb7..fad90984b 100644
--- a/pkg/sentry/fs/lock/lock_test.go
+++ b/pkg/sentry/fs/lock/lock_test.go
@@ -42,9 +42,6 @@ func equals(e0, e1 []entry) bool {
if !reflect.DeepEqual(e0[i].LockRange, e1[i].LockRange) {
return false
}
- if e0[i].Lock.HasWriter != e1[i].Lock.HasWriter {
- return false
- }
if e0[i].Lock.Writer != e1[i].Lock.Writer {
return false
}
@@ -105,7 +102,7 @@ func TestCanLock(t *testing.T) {
LockRange: LockRange{2048, 3072},
},
{
- Lock: Lock{HasWriter: true, Writer: 1},
+ Lock: Lock{Writer: 1},
LockRange: LockRange{3072, 4096},
},
})
@@ -241,7 +238,7 @@ func TestSetLock(t *testing.T) {
// 0 max uint64
after: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, LockEOF},
},
},
@@ -254,7 +251,7 @@ func TestSetLock(t *testing.T) {
// 0 max uint64
before: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, LockEOF},
},
},
@@ -273,7 +270,7 @@ func TestSetLock(t *testing.T) {
LockRange: LockRange{0, 4096},
},
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{4096, LockEOF},
},
},
@@ -301,7 +298,7 @@ func TestSetLock(t *testing.T) {
// 0 4096 max uint64
after: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, 4096},
},
{
@@ -318,7 +315,7 @@ func TestSetLock(t *testing.T) {
// 0 max uint64
before: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, LockEOF},
},
},
@@ -550,7 +547,7 @@ func TestSetLock(t *testing.T) {
LockRange: LockRange{0, 1024},
},
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{1024, 4096},
},
{
@@ -594,7 +591,7 @@ func TestSetLock(t *testing.T) {
LockRange: LockRange{0, 1024},
},
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{1024, 3072},
},
{
@@ -633,7 +630,7 @@ func TestSetLock(t *testing.T) {
// 0 1024 2048 4096 max uint64
before: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, 1024},
},
{
@@ -663,11 +660,11 @@ func TestSetLock(t *testing.T) {
// 0 1024 max uint64
after: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, 1024},
},
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{1024, LockEOF},
},
},
@@ -675,28 +672,30 @@ func TestSetLock(t *testing.T) {
}
for _, test := range tests {
- l := fill(test.before)
+ t.Run(test.name, func(t *testing.T) {
+ l := fill(test.before)
- r := LockRange{Start: test.start, End: test.end}
- success := l.lock(test.uid, test.lockType, r)
- var got []entry
- for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
- got = append(got, entry{
- Lock: seg.Value(),
- LockRange: seg.Range(),
- })
- }
+ r := LockRange{Start: test.start, End: test.end}
+ success := l.lock(test.uid, test.lockType, r)
+ var got []entry
+ for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+ got = append(got, entry{
+ Lock: seg.Value(),
+ LockRange: seg.Range(),
+ })
+ }
- if success != test.success {
- t.Errorf("%s: setlock(%v, %+v, %d, %d) got success %v, want %v", test.name, test.before, r, test.uid, test.lockType, success, test.success)
- continue
- }
+ if success != test.success {
+ t.Errorf("setlock(%v, %+v, %d, %d) got success %v, want %v", test.before, r, test.uid, test.lockType, success, test.success)
+ return
+ }
- if success {
- if !equals(got, test.after) {
- t.Errorf("%s: got set %+v, want %+v", test.name, got, test.after)
+ if success {
+ if !equals(got, test.after) {
+ t.Errorf("got set %+v, want %+v", got, test.after)
+ }
}
- }
+ })
}
}
@@ -782,7 +781,7 @@ func TestUnlock(t *testing.T) {
// 0 max uint64
before: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, LockEOF},
},
},
@@ -824,7 +823,7 @@ func TestUnlock(t *testing.T) {
// 0 max uint64
before: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, LockEOF},
},
},
@@ -837,7 +836,7 @@ func TestUnlock(t *testing.T) {
// 0 4096 max uint64
after: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{4096, LockEOF},
},
},
@@ -876,7 +875,7 @@ func TestUnlock(t *testing.T) {
// 0 max uint64
before: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, LockEOF},
},
},
@@ -889,7 +888,7 @@ func TestUnlock(t *testing.T) {
// 0 4096
after: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, 4096},
},
},
@@ -906,7 +905,7 @@ func TestUnlock(t *testing.T) {
LockRange: LockRange{0, 1024},
},
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{1024, 4096},
},
{
@@ -974,7 +973,7 @@ func TestUnlock(t *testing.T) {
// 0 1024 4096 max uint64
before: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, 1024},
},
{
@@ -991,7 +990,7 @@ func TestUnlock(t *testing.T) {
// 0 8 4096 max uint64
after: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, 8},
},
{
@@ -1008,7 +1007,7 @@ func TestUnlock(t *testing.T) {
// 0 1024 4096 max uint64
before: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, 1024},
},
{
@@ -1025,7 +1024,7 @@ func TestUnlock(t *testing.T) {
// 0 1024 4096 8192 max uint64
after: []entry{
{
- Lock: Lock{HasWriter: true, Writer: 0},
+ Lock: Lock{Writer: 0},
LockRange: LockRange{0, 1024},
},
{
@@ -1041,19 +1040,21 @@ func TestUnlock(t *testing.T) {
}
for _, test := range tests {
- l := fill(test.before)
+ t.Run(test.name, func(t *testing.T) {
+ l := fill(test.before)
- r := LockRange{Start: test.start, End: test.end}
- l.unlock(test.uid, r)
- var got []entry
- for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
- got = append(got, entry{
- Lock: seg.Value(),
- LockRange: seg.Range(),
- })
- }
- if !equals(got, test.after) {
- t.Errorf("%s: got set %+v, want %+v", test.name, got, test.after)
- }
+ r := LockRange{Start: test.start, End: test.end}
+ l.unlock(test.uid, r)
+ var got []entry
+ for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+ got = append(got, entry{
+ Lock: seg.Value(),
+ LockRange: seg.Range(),
+ })
+ }
+ if !equals(got, test.after) {
+ t.Errorf("got set %+v, want %+v", got, test.after)
+ }
+ })
}
}
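
Editor's note: the test loops above now wrap each table case in t.Run, so failures are attributed to their case name and a single case can be rerun with go test -run 'TestSetLock/<name>'. The general shape, for reference (a generic sketch, not code from this package):

    package lock_test

    import "testing"

    func TestTableDriven(t *testing.T) {
        tests := []struct {
            name     string
            in, want int
        }{
            {name: "zero", in: 0, want: 0},
            {name: "double", in: 2, want: 4},
        }
        for _, test := range tests {
            test := test // capture the range variable for the closure
            t.Run(test.name, func(t *testing.T) {
                if got := test.in * 2; got != test.want {
                    t.Errorf("got %d, want %d", got, test.want)
                }
            })
        }
    }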
diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go
index 37bae6810..ee69b10e8 100644
--- a/pkg/sentry/fs/mount.go
+++ b/pkg/sentry/fs/mount.go
@@ -51,7 +51,7 @@ type MountSourceOperations interface {
DirentOperations
// Destroy destroys the MountSource.
- Destroy()
+ Destroy(ctx context.Context)
// Below are MountSourceOperations that do not conform to Linux.
@@ -165,16 +165,16 @@ func (msrc *MountSource) DecDirentRefs() {
}
}
-func (msrc *MountSource) destroy() {
+func (msrc *MountSource) destroy(ctx context.Context) {
if c := msrc.DirentRefs(); c != 0 {
panic(fmt.Sprintf("MountSource with non-zero direntRefs is being destroyed: %d", c))
}
- msrc.MountSourceOperations.Destroy()
+ msrc.MountSourceOperations.Destroy(ctx)
}
// DecRef drops a reference on the MountSource.
-func (msrc *MountSource) DecRef() {
- msrc.DecRefWithDestructor(msrc.destroy)
+func (msrc *MountSource) DecRef(ctx context.Context) {
+ msrc.DecRefWithDestructor(ctx, msrc.destroy)
}
// FlushDirentRefs drops all references held by the MountSource on Dirents.
@@ -264,7 +264,7 @@ func (*SimpleMountSourceOperations) ResetInodeMappings() {}
func (*SimpleMountSourceOperations) SaveInodeMapping(*Inode, string) {}
// Destroy implements MountSourceOperations.Destroy.
-func (*SimpleMountSourceOperations) Destroy() {}
+func (*SimpleMountSourceOperations) Destroy(context.Context) {}
// Info defines attributes of a filesystem.
type Info struct {
diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go
index 78e35b1e6..7badc75d6 100644
--- a/pkg/sentry/fs/mount_overlay.go
+++ b/pkg/sentry/fs/mount_overlay.go
@@ -115,9 +115,9 @@ func (o *overlayMountSourceOperations) SaveInodeMapping(inode *Inode, path strin
}
// Destroy drops references on the upper and lower MountSource.
-func (o *overlayMountSourceOperations) Destroy() {
- o.upper.DecRef()
- o.lower.DecRef()
+func (o *overlayMountSourceOperations) Destroy(ctx context.Context) {
+ o.upper.DecRef(ctx)
+ o.lower.DecRef(ctx)
}
// type overlayFilesystem is the filesystem for overlay mounts.
diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go
index a3d10770b..6c296f5d0 100644
--- a/pkg/sentry/fs/mount_test.go
+++ b/pkg/sentry/fs/mount_test.go
@@ -18,6 +18,7 @@ import (
"fmt"
"testing"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/contexttest"
)
@@ -32,13 +33,13 @@ func cacheReallyContains(cache *DirentCache, d *Dirent) bool {
return false
}
-func mountPathsAre(root *Dirent, got []*Mount, want ...string) error {
+func mountPathsAre(ctx context.Context, root *Dirent, got []*Mount, want ...string) error {
gotPaths := make(map[string]struct{}, len(got))
gotStr := make([]string, len(got))
for i, g := range got {
if groot := g.Root(); groot != nil {
name, _ := groot.FullName(root)
- groot.DecRef()
+ groot.DecRef(ctx)
gotStr[i] = name
gotPaths[name] = struct{}{}
}
@@ -69,7 +70,7 @@ func TestMountSourceOnlyCachedOnce(t *testing.T) {
t.Fatalf("NewMountNamespace failed: %v", err)
}
rootDirent := mm.Root()
- defer rootDirent.DecRef()
+ defer rootDirent.DecRef(ctx)
// Get a child of the root which we will mount over. Note that the
// MockInodeOperations causes Walk to always succeed.
@@ -125,7 +126,7 @@ func TestAllMountsUnder(t *testing.T) {
t.Fatalf("NewMountNamespace failed: %v", err)
}
rootDirent := mm.Root()
- defer rootDirent.DecRef()
+ defer rootDirent.DecRef(ctx)
// Add mounts at the following paths:
paths := []string{
@@ -150,14 +151,14 @@ func TestAllMountsUnder(t *testing.T) {
if err := mm.Mount(ctx, d, submountInode); err != nil {
t.Fatalf("could not mount at %q: %v", p, err)
}
- d.DecRef()
+ d.DecRef(ctx)
}
// mm root should contain all submounts (and does not include the root mount).
rootMnt := mm.FindMount(rootDirent)
submounts := mm.AllMountsUnder(rootMnt)
allPaths := append(paths, "/")
- if err := mountPathsAre(rootDirent, submounts, allPaths...); err != nil {
+ if err := mountPathsAre(ctx, rootDirent, submounts, allPaths...); err != nil {
t.Error(err)
}
@@ -181,9 +182,9 @@ func TestAllMountsUnder(t *testing.T) {
if err != nil {
t.Fatalf("could not find path %q in mount manager: %v", "/foo", err)
}
- defer d.DecRef()
+ defer d.DecRef(ctx)
submounts = mm.AllMountsUnder(mm.FindMount(d))
- if err := mountPathsAre(rootDirent, submounts, "/foo", "/foo/bar", "/foo/qux", "/foo/bar/baz"); err != nil {
+ if err := mountPathsAre(ctx, rootDirent, submounts, "/foo", "/foo/bar", "/foo/qux", "/foo/bar/baz"); err != nil {
t.Error(err)
}
@@ -193,9 +194,9 @@ func TestAllMountsUnder(t *testing.T) {
if err != nil {
t.Fatalf("could not find path %q in mount manager: %v", "/waldo", err)
}
- defer waldo.DecRef()
+ defer waldo.DecRef(ctx)
submounts = mm.AllMountsUnder(mm.FindMount(waldo))
- if err := mountPathsAre(rootDirent, submounts, "/waldo"); err != nil {
+ if err := mountPathsAre(ctx, rootDirent, submounts, "/waldo"); err != nil {
t.Error(err)
}
}
@@ -212,7 +213,7 @@ func TestUnmount(t *testing.T) {
t.Fatalf("NewMountNamespace failed: %v", err)
}
rootDirent := mm.Root()
- defer rootDirent.DecRef()
+ defer rootDirent.DecRef(ctx)
// Add mounts at the following paths:
paths := []string{
@@ -240,7 +241,7 @@ func TestUnmount(t *testing.T) {
if err := mm.Mount(ctx, d, submountInode); err != nil {
t.Fatalf("could not mount at %q: %v", p, err)
}
- d.DecRef()
+ d.DecRef(ctx)
}
allPaths := make([]string, len(paths)+1)
@@ -259,13 +260,13 @@ func TestUnmount(t *testing.T) {
if err := mm.Unmount(ctx, d, false); err != nil {
t.Fatalf("could not unmount at %q: %v", p, err)
}
- d.DecRef()
+ d.DecRef(ctx)
// Remove the path that has been unmounted and the check that the remaining
// mounts are still there.
allPaths = allPaths[:len(allPaths)-1]
submounts := mm.AllMountsUnder(rootMnt)
- if err := mountPathsAre(rootDirent, submounts, allPaths...); err != nil {
+ if err := mountPathsAre(ctx, rootDirent, submounts, allPaths...); err != nil {
t.Error(err)
}
}
diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go
index b414ddaee..d741c4339 100644
--- a/pkg/sentry/fs/mounts.go
+++ b/pkg/sentry/fs/mounts.go
@@ -17,13 +17,9 @@ package fs
import (
"fmt"
"math"
- "path"
- "strings"
"syscall"
- "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sync"
@@ -238,7 +234,7 @@ func (mns *MountNamespace) flushMountSourceRefsLocked() {
// After destroy is called, the MountNamespace may continue to be referenced (for
// example via /proc/mounts), but should free all resources and shouldn't have
// Find* methods called.
-func (mns *MountNamespace) destroy() {
+func (mns *MountNamespace) destroy(ctx context.Context) {
mns.mu.Lock()
defer mns.mu.Unlock()
@@ -251,13 +247,13 @@ func (mns *MountNamespace) destroy() {
for _, mp := range mns.mounts {
// Drop the mount reference on all mounted dirents.
for ; mp != nil; mp = mp.previous {
- mp.root.DecRef()
+ mp.root.DecRef(ctx)
}
}
mns.mounts = nil
// Drop reference on the root.
- mns.root.DecRef()
+ mns.root.DecRef(ctx)
// Ensure that root cannot be accessed via this MountNamespace any
// more.
@@ -269,8 +265,8 @@ func (mns *MountNamespace) destroy() {
}
// DecRef implements RefCounter.DecRef with destructor mns.destroy.
-func (mns *MountNamespace) DecRef() {
- mns.DecRefWithDestructor(mns.destroy)
+func (mns *MountNamespace) DecRef(ctx context.Context) {
+ mns.DecRefWithDestructor(ctx, mns.destroy)
}
// withMountLocked prevents further walks to `node`, because `node` is about to
@@ -316,7 +312,7 @@ func (mns *MountNamespace) Mount(ctx context.Context, mountPoint *Dirent, inode
if err != nil {
return err
}
- defer replacement.DecRef()
+ defer replacement.DecRef(ctx)
// Set the mount's root dirent and id.
parentMnt := mns.findMountLocked(mountPoint)
@@ -398,7 +394,7 @@ func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly
panic(fmt.Sprintf("Last mount in the chain must be a undo mount: %+v", prev))
}
// Drop mount reference taken at the end of MountNamespace.Mount.
- prev.root.DecRef()
+ prev.root.DecRef(ctx)
} else {
mns.mounts[prev.root] = prev
}
@@ -500,11 +496,11 @@ func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path
// non-directory root is hopeless.
if current != root {
if !IsDir(current.Inode.StableAttr) {
- current.DecRef() // Drop reference from above.
+ current.DecRef(ctx) // Drop reference from above.
return nil, syserror.ENOTDIR
}
if err := current.Inode.CheckPermission(ctx, PermMask{Execute: true}); err != nil {
- current.DecRef() // Drop reference from above.
+ current.DecRef(ctx) // Drop reference from above.
return nil, err
}
}
@@ -515,12 +511,12 @@ func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path
// Allow failed walks to cache the dirent, because no
// children will acquire a reference at the end.
current.maybeExtendReference()
- current.DecRef()
+ current.DecRef(ctx)
return nil, err
}
// Drop old reference.
- current.DecRef()
+ current.DecRef(ctx)
if remainder != "" {
// Ensure it's resolved, unless it's the last level.
@@ -574,11 +570,11 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, rema
case nil:
// Make sure we didn't exhaust the traversal budget.
if *remainingTraversals == 0 {
- target.DecRef()
+ target.DecRef(ctx)
return nil, syscall.ELOOP
}
- node.DecRef() // Drop the original reference.
+ node.DecRef(ctx) // Drop the original reference.
return target, nil
case syscall.ENOLINK:
@@ -586,7 +582,7 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, rema
return node, nil
case ErrResolveViaReadlink:
- defer node.DecRef() // See above.
+ defer node.DecRef(ctx) // See above.
// First, check if we should traverse.
if *remainingTraversals == 0 {
@@ -612,7 +608,7 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, rema
return d, err
default:
- node.DecRef() // Drop for err; see above.
+ node.DecRef(ctx) // Drop for err; see above.
// Propagate the error.
return nil, err
@@ -625,71 +621,3 @@ func (mns *MountNamespace) SyncAll(ctx context.Context) {
defer mns.mu.Unlock()
mns.root.SyncAll(ctx)
}
-
-// ResolveExecutablePath resolves the given executable name given a set of
-// paths that might contain it.
-func (mns *MountNamespace) ResolveExecutablePath(ctx context.Context, wd, name string, paths []string) (string, error) {
- // Absolute paths can be used directly.
- if path.IsAbs(name) {
- return name, nil
- }
-
- // Paths with '/' in them should be joined to the working directory, or
- // to the root if working directory is not set.
- if strings.IndexByte(name, '/') > 0 {
- if wd == "" {
- wd = "/"
- }
- if !path.IsAbs(wd) {
- return "", fmt.Errorf("working directory %q must be absolute", wd)
- }
- return path.Join(wd, name), nil
- }
-
- // Otherwise, We must lookup the name in the paths, starting from the
- // calling context's root directory.
- root := RootFromContext(ctx)
- if root == nil {
- // Caller has no root. Don't bother traversing anything.
- return "", syserror.ENOENT
- }
- defer root.DecRef()
- for _, p := range paths {
- binPath := path.Join(p, name)
- traversals := uint(linux.MaxSymlinkTraversals)
- d, err := mns.FindInode(ctx, root, nil, binPath, &traversals)
- if err == syserror.ENOENT || err == syserror.EACCES {
- // Didn't find it here.
- continue
- }
- if err != nil {
- return "", err
- }
- defer d.DecRef()
-
- // Check that it is a regular file.
- if !IsRegular(d.Inode.StableAttr) {
- continue
- }
-
- // Check whether we can read and execute the found file.
- if err := d.Inode.CheckPermission(ctx, PermMask{Read: true, Execute: true}); err != nil {
- log.Infof("Found executable at %q, but user cannot execute it: %v", binPath, err)
- continue
- }
- return path.Join("/", p, name), nil
- }
- return "", syserror.ENOENT
-}
-
-// GetPath returns the PATH as a slice of strings given the environment
-// variables.
-func GetPath(env []string) []string {
- const prefix = "PATH="
- for _, e := range env {
- if strings.HasPrefix(e, prefix) {
- return strings.Split(strings.TrimPrefix(e, prefix), ":")
- }
- }
- return nil
-}
diff --git a/pkg/sentry/fs/mounts_test.go b/pkg/sentry/fs/mounts_test.go
index a69b41468..975d6cbc9 100644
--- a/pkg/sentry/fs/mounts_test.go
+++ b/pkg/sentry/fs/mounts_test.go
@@ -51,7 +51,7 @@ func TestFindLink(t *testing.T) {
}
root := mm.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
foo, err := root.Walk(ctx, root, "foo")
if err != nil {
t.Fatalf("Error walking to foo: %v", err)
diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go
index a8ae7d81d..35013a21b 100644
--- a/pkg/sentry/fs/overlay.go
+++ b/pkg/sentry/fs/overlay.go
@@ -107,7 +107,7 @@ func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags Mount
msrc := newOverlayMountSource(ctx, upper.MountSource, lower.MountSource, flags)
overlay, err := newOverlayEntry(ctx, upper, lower, true)
if err != nil {
- msrc.DecRef()
+ msrc.DecRef(ctx)
return nil, err
}
@@ -130,7 +130,7 @@ func NewOverlayRootFile(ctx context.Context, upperMS *MountSource, lower *Inode,
msrc := newOverlayMountSource(ctx, upperMS, lower.MountSource, flags)
overlay, err := newOverlayEntry(ctx, nil, lower, true)
if err != nil {
- msrc.DecRef()
+ msrc.DecRef(ctx)
return nil, err
}
return newOverlayInode(ctx, overlay, msrc), nil
@@ -230,16 +230,16 @@ func newOverlayEntry(ctx context.Context, upper *Inode, lower *Inode, lowerExist
}, nil
}
-func (o *overlayEntry) release() {
+func (o *overlayEntry) release(ctx context.Context) {
// We drop a reference on upper and lower file system Inodes
// rather than releasing them, because in-memory filesystems
// may hold an extra reference to these Inodes so that they
// stay in memory.
if o.upper != nil {
- o.upper.DecRef()
+ o.upper.DecRef(ctx)
}
if o.lower != nil {
- o.lower.DecRef()
+ o.lower.DecRef(ctx)
}
}
diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go
index 35972e23c..45523adf8 100644
--- a/pkg/sentry/fs/proc/fds.go
+++ b/pkg/sentry/fs/proc/fds.go
@@ -56,11 +56,11 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF
// readDescriptors reads fds in the task starting at offset, and calls the
// toDentAttr callback for each to get a DentAttr, which it then emits. This is
// a helper for implementing fs.InodeOperations.Readdir.
-func readDescriptors(t *kernel.Task, c *fs.DirCtx, offset int64, toDentAttr func(int) fs.DentAttr) (int64, error) {
+func readDescriptors(ctx context.Context, t *kernel.Task, c *fs.DirCtx, offset int64, toDentAttr func(int) fs.DentAttr) (int64, error) {
var fds []int32
t.WithMuLocked(func(t *kernel.Task) {
if fdTable := t.FDTable(); fdTable != nil {
- fds = fdTable.GetFDs()
+ fds = fdTable.GetFDs(ctx)
}
})
@@ -116,7 +116,7 @@ func (f *fd) GetFile(context.Context, *fs.Dirent, fs.FileFlags) (*fs.File, error
func (f *fd) Readlink(ctx context.Context, _ *fs.Inode) (string, error) {
root := fs.RootFromContext(ctx)
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(ctx)
}
n, _ := f.file.Dirent.FullName(root)
return n, nil
@@ -135,13 +135,7 @@ func (f *fd) Truncate(context.Context, *fs.Inode, int64) error {
func (f *fd) Release(ctx context.Context) {
f.Symlink.Release(ctx)
- f.file.DecRef()
-}
-
-// Close releases the reference on the file.
-func (f *fd) Close() error {
- f.file.DecRef()
- return nil
+ f.file.DecRef(ctx)
}
// fdDir is an InodeOperations for /proc/TID/fd.
@@ -227,7 +221,7 @@ func (f *fdDirFile) Readdir(ctx context.Context, file *fs.File, ser fs.DentrySer
if f.isInfoFile {
typ = fs.Symlink
}
- return readDescriptors(f.t, dirCtx, file.Offset(), func(fd int) fs.DentAttr {
+ return readDescriptors(ctx, f.t, dirCtx, file.Offset(), func(fd int) fs.DentAttr {
return fs.GenericDentAttr(typ, device.ProcDevice)
})
}
@@ -261,7 +255,7 @@ func (fdid *fdInfoDir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs
// locks, and other data. For now we only have flags.
// See https://www.kernel.org/doc/Documentation/filesystems/proc.txt
flags := file.Flags().ToLinux() | fdFlags.ToLinuxFileFlags()
- file.DecRef()
+ file.DecRef(ctx)
contents := []byte(fmt.Sprintf("flags:\t0%o\n", flags))
return newStaticProcInode(ctx, dir.MountSource, contents)
})
diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go
index 1fc9c703c..6a63c47b3 100644
--- a/pkg/sentry/fs/proc/mounts.go
+++ b/pkg/sentry/fs/proc/mounts.go
@@ -47,7 +47,7 @@ func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) {
// The task has been destroyed. Nothing to show here.
return
}
- defer rootDir.DecRef()
+ defer rootDir.DecRef(t)
mnt := t.MountNamespace().FindMount(rootDir)
if mnt == nil {
@@ -64,7 +64,7 @@ func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) {
continue // No longer valid.
}
mountPath, desc := mroot.FullName(rootDir)
- mroot.DecRef()
+ mroot.DecRef(t)
if !desc {
// MountSources that are not descendants of the chroot jail are ignored.
continue
@@ -97,7 +97,7 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se
if mroot == nil {
return // No longer valid.
}
- defer mroot.DecRef()
+ defer mroot.DecRef(ctx)
// Format:
// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
@@ -216,7 +216,7 @@ func (mf *mountsFile) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHan
if root == nil {
return // No longer valid.
}
- defer root.DecRef()
+ defer root.DecRef(ctx)
flags := root.Inode.MountSource.Flags
opts := "rw"
diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go
index bd18177d4..83a43aa26 100644
--- a/pkg/sentry/fs/proc/net.go
+++ b/pkg/sentry/fs/proc/net.go
@@ -419,7 +419,7 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s
}
sfile := s.(*fs.File)
if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX {
- s.DecRef()
+ s.DecRef(ctx)
// Not a unix socket.
continue
}
@@ -479,7 +479,7 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s
}
fmt.Fprintf(&buf, "\n")
- s.DecRef()
+ s.DecRef(ctx)
}
data := []seqfile.SeqData{
@@ -574,7 +574,7 @@ func commonReadSeqFileDataTCP(ctx context.Context, n seqfile.SeqHandle, k *kerne
panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
}
if family, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) {
- s.DecRef()
+ s.DecRef(ctx)
// Not tcp4 sockets.
continue
}
@@ -664,7 +664,7 @@ func commonReadSeqFileDataTCP(ctx context.Context, n seqfile.SeqHandle, k *kerne
fmt.Fprintf(&buf, "\n")
- s.DecRef()
+ s.DecRef(ctx)
}
data := []seqfile.SeqData{
@@ -752,7 +752,7 @@ func (n *netUDP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se
panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
}
if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM {
- s.DecRef()
+ s.DecRef(ctx)
// Not udp4 socket.
continue
}
@@ -822,7 +822,7 @@ func (n *netUDP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se
fmt.Fprintf(&buf, "\n")
- s.DecRef()
+ s.DecRef(ctx)
}
data := []seqfile.SeqData{
diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go
index c659224a7..77e0e1d26 100644
--- a/pkg/sentry/fs/proc/proc.go
+++ b/pkg/sentry/fs/proc/proc.go
@@ -213,7 +213,7 @@ func (rpf *rootProcFile) Readdir(ctx context.Context, file *fs.File, ser fs.Dent
// Add dot and dotdot.
root := fs.RootFromContext(ctx)
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(ctx)
}
dot, dotdot := file.Dirent.GetDotAttrs(root)
names = append(names, ".", "..")
diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go
index 702fdd392..8615b60f0 100644
--- a/pkg/sentry/fs/proc/sys_net.go
+++ b/pkg/sentry/fs/proc/sys_net.go
@@ -272,6 +272,96 @@ func (f *tcpSackFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSeque
return n, f.tcpSack.stack.SetTCPSACKEnabled(*f.tcpSack.enabled)
}
+// +stateify savable
+type tcpRecovery struct {
+ fsutil.SimpleFileInode
+
+ stack inet.Stack `state:"wait"`
+ recovery inet.TCPLossRecovery
+}
+
+func newTCPRecoveryInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode {
+ ts := &tcpRecovery{
+ SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC),
+ stack: s,
+ }
+ sattr := fs.StableAttr{
+ DeviceID: device.ProcDevice.DeviceID(),
+ InodeID: device.ProcDevice.NextIno(),
+ BlockSize: usermem.PageSize,
+ Type: fs.SpecialFile,
+ }
+ return fs.NewInode(ctx, ts, msrc, sattr)
+}
+
+// Truncate implements fs.InodeOperations.Truncate.
+func (*tcpRecovery) Truncate(context.Context, *fs.Inode, int64) error {
+ return nil
+}
+
+// GetFile implements fs.InodeOperations.GetFile.
+func (r *tcpRecovery) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+ flags.Pread = true
+ flags.Pwrite = true
+ return fs.NewFile(ctx, dirent, flags, &tcpRecoveryFile{
+ tcpRecovery: r,
+ stack: r.stack,
+ }), nil
+}
+
+// +stateify savable
+type tcpRecoveryFile struct {
+ fsutil.FileGenericSeek `state:"nosave"`
+ fsutil.FileNoIoctl `state:"nosave"`
+ fsutil.FileNoMMap `state:"nosave"`
+ fsutil.FileNoSplice `state:"nosave"`
+ fsutil.FileNoopRelease `state:"nosave"`
+ fsutil.FileNoopFlush `state:"nosave"`
+ fsutil.FileNoopFsync `state:"nosave"`
+ fsutil.FileNotDirReaddir `state:"nosave"`
+ fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+ waiter.AlwaysReady `state:"nosave"`
+
+ tcpRecovery *tcpRecovery
+
+ stack inet.Stack `state:"wait"`
+}
+
+// Read implements fs.FileOperations.Read.
+func (f *tcpRecoveryFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+ if offset != 0 {
+ return 0, io.EOF
+ }
+
+ recovery, err := f.stack.TCPRecovery()
+ if err != nil {
+ return 0, err
+ }
+ f.tcpRecovery.recovery = recovery
+ s := fmt.Sprintf("%d\n", f.tcpRecovery.recovery)
+ n, err := dst.CopyOut(ctx, []byte(s))
+ return int64(n), err
+}
+
+// Write implements fs.FileOperations.Write.
+func (f *tcpRecoveryFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+ if src.NumBytes() == 0 {
+ return 0, nil
+ }
+ src = src.TakeFirst(usermem.PageSize - 1)
+
+ var v int32
+ n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+ if err != nil {
+ return 0, err
+ }
+ f.tcpRecovery.recovery = inet.TCPLossRecovery(v)
+ if err := f.tcpRecovery.stack.SetTCPRecovery(f.tcpRecovery.recovery); err != nil {
+ return 0, err
+ }
+ return n, nil
+}
+
func (p *proc) newSysNetCore(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode {
// The following files are simple stubs until they are implemented in
// netstack; most of these files are configuration related. We use the
@@ -351,6 +441,11 @@ func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s ine
contents["tcp_wmem"] = newTCPMemInode(ctx, msrc, s, tcpWMem)
}
+ // Add tcp_recovery.
+ if _, err := s.TCPRecovery(); err == nil {
+ contents["tcp_recovery"] = newTCPRecoveryInode(ctx, msrc, s)
+ }
+
d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil)
}
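
The new tcp_recovery file follows the usual proc sysctl shape: Read formats the current value as "<int>\n", and Write takes at most one page of input, parses a single decimal int32 via usermem.CopyInt32StringInVec, and pushes it to the stack. A plain-Go sketch of that parse/format round trip; parseRecovery is a hypothetical stand-in for illustration, not a sentry helper (on Linux, this sysctl's low bits select RACK-based loss recovery behavior):

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseRecovery mimics tcpRecoveryFile.Write: cap input at one page minus
// one byte, then parse exactly one decimal int32.
func parseRecovery(src []byte) (int32, error) {
	const pageSize = 4096
	if len(src) > pageSize-1 {
		src = src[:pageSize-1]
	}
	v, err := strconv.ParseInt(strings.TrimSpace(string(src)), 10, 32)
	if err != nil {
		return 0, err
	}
	return int32(v), nil
}

func main() {
	v, err := parseRecovery([]byte("1\n"))
	fmt.Println(v, err)              // 1 <nil>
	fmt.Printf("read-back: %d\n", v) // what a subsequent Read copies out
}
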
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index 4bbe90198..9cf7f2a62 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -185,7 +185,7 @@ func (f *subtasksFile) Readdir(ctx context.Context, file *fs.File, ser fs.Dentry
// Serialize "." and "..".
root := fs.RootFromContext(ctx)
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(ctx)
}
dot, dotdot := file.Dirent.GetDotAttrs(root)
if err := dirCtx.DirEmit(".", dot); err != nil {
@@ -295,7 +295,7 @@ func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
if err != nil {
return "", err
}
- defer exec.DecRef()
+ defer exec.DecRef(ctx)
return exec.PathnameWithDeleted(ctx), nil
}
diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go
index bfa304552..f4fcddecb 100644
--- a/pkg/sentry/fs/ramfs/dir.go
+++ b/pkg/sentry/fs/ramfs/dir.go
@@ -219,7 +219,7 @@ func (d *Dir) Remove(ctx context.Context, _ *fs.Inode, name string) error {
}
// Remove our reference on the inode.
- inode.DecRef()
+ inode.DecRef(ctx)
return nil
}
@@ -250,7 +250,7 @@ func (d *Dir) RemoveDirectory(ctx context.Context, _ *fs.Inode, name string) err
}
// Remove our reference on the inode.
- inode.DecRef()
+ inode.DecRef(ctx)
return nil
}
@@ -326,7 +326,7 @@ func (d *Dir) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.F
// Create the Dirent and corresponding file.
created := fs.NewDirent(ctx, inode, name)
- defer created.DecRef()
+ defer created.DecRef(ctx)
return created.Inode.GetFile(ctx, created, flags)
}
@@ -412,11 +412,11 @@ func (*Dir) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, ol
}
// Release implements fs.InodeOperation.Release.
-func (d *Dir) Release(_ context.Context) {
+func (d *Dir) Release(ctx context.Context) {
// Drop references on all children.
d.mu.Lock()
for _, i := range d.children {
- i.DecRef()
+ i.DecRef(ctx)
}
d.mu.Unlock()
}
@@ -456,7 +456,7 @@ func (dfo *dirFileOperations) IterateDir(ctx context.Context, d *fs.Dirent, dirC
func (dfo *dirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) {
root := fs.RootFromContext(ctx)
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(ctx)
}
dirCtx := &fs.DirCtx{
Serializer: serializer,
@@ -473,13 +473,13 @@ func hasChildren(ctx context.Context, inode *fs.Inode) (bool, error) {
// dropped when that dirent is destroyed.
inode.IncRef()
d := fs.NewTransientDirent(inode)
- defer d.DecRef()
+ defer d.DecRef(ctx)
file, err := inode.GetFile(ctx, d, fs.FileFlags{Read: true})
if err != nil {
return false, err
}
- defer file.DecRef()
+ defer file.DecRef(ctx)
ser := &fs.CollectEntriesSerializer{}
if err := file.Readdir(ctx, ser); err != nil {
@@ -530,7 +530,7 @@ func Rename(ctx context.Context, oldParent fs.InodeOperations, oldName string, n
if err != nil {
return err
}
- inode.DecRef()
+ inode.DecRef(ctx)
}
// Be careful, we may have already grabbed this mutex above.
diff --git a/pkg/sentry/fs/ramfs/tree_test.go b/pkg/sentry/fs/ramfs/tree_test.go
index a6ed8b2c5..3e0d1e07e 100644
--- a/pkg/sentry/fs/ramfs/tree_test.go
+++ b/pkg/sentry/fs/ramfs/tree_test.go
@@ -67,7 +67,7 @@ func TestMakeDirectoryTree(t *testing.T) {
continue
}
root := mm.Root()
- defer mm.DecRef()
+ defer mm.DecRef(ctx)
for _, p := range test.subdirs {
maxTraversals := uint(0)
diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go
index 88c344089..f362ca9b6 100644
--- a/pkg/sentry/fs/timerfd/timerfd.go
+++ b/pkg/sentry/fs/timerfd/timerfd.go
@@ -55,7 +55,7 @@ type TimerOperations struct {
func NewFile(ctx context.Context, c ktime.Clock) *fs.File {
dirent := fs.NewDirent(ctx, anon.NewInode(ctx), "anon_inode:[timerfd]")
// Release the initial dirent reference after NewFile takes a reference.
- defer dirent.DecRef()
+ defer dirent.DecRef(ctx)
tops := &TimerOperations{}
tops.timer = ktime.NewTimer(c, tops)
// Timerfds reject writes, but the Write flag must be set in order to
@@ -65,7 +65,7 @@ func NewFile(ctx context.Context, c ktime.Clock) *fs.File {
}
// Release implements fs.FileOperations.Release.
-func (t *TimerOperations) Release() {
+func (t *TimerOperations) Release(context.Context) {
t.timer.Destroy()
}
diff --git a/pkg/sentry/fs/tmpfs/file_test.go b/pkg/sentry/fs/tmpfs/file_test.go
index aaba35502..d4d613ea9 100644
--- a/pkg/sentry/fs/tmpfs/file_test.go
+++ b/pkg/sentry/fs/tmpfs/file_test.go
@@ -46,7 +46,7 @@ func newFile(ctx context.Context) *fs.File {
func TestGrow(t *testing.T) {
ctx := contexttest.Context(t)
f := newFile(ctx)
- defer f.DecRef()
+ defer f.DecRef(ctx)
abuf := bytes.Repeat([]byte{'a'}, 68)
n, err := f.Pwritev(ctx, usermem.BytesIOSequence(abuf), 0)
diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go
index 108654827..463f6189e 100644
--- a/pkg/sentry/fs/tty/dir.go
+++ b/pkg/sentry/fs/tty/dir.go
@@ -132,7 +132,7 @@ func (d *dirInodeOperations) Release(ctx context.Context) {
d.mu.Lock()
defer d.mu.Unlock()
- d.master.DecRef()
+ d.master.DecRef(ctx)
if len(d.slaves) != 0 {
panic(fmt.Sprintf("devpts directory still contains active terminals: %+v", d))
}
@@ -263,7 +263,7 @@ func (d *dirInodeOperations) allocateTerminal(ctx context.Context) (*Terminal, e
}
// masterClose is called when the master end of t is closed.
-func (d *dirInodeOperations) masterClose(t *Terminal) {
+func (d *dirInodeOperations) masterClose(ctx context.Context, t *Terminal) {
d.mu.Lock()
defer d.mu.Unlock()
@@ -277,7 +277,7 @@ func (d *dirInodeOperations) masterClose(t *Terminal) {
panic(fmt.Sprintf("Terminal %+v doesn't exist in %+v?", t, d))
}
- s.DecRef()
+ s.DecRef(ctx)
delete(d.slaves, t.n)
d.dentryMap.Remove(strconv.FormatUint(uint64(t.n), 10))
}
@@ -322,7 +322,7 @@ func (df *dirFileOperations) IterateDir(ctx context.Context, d *fs.Dirent, dirCt
func (df *dirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) {
root := fs.RootFromContext(ctx)
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(ctx)
}
dirCtx := &fs.DirCtx{
Serializer: serializer,
diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go
index 8fe05ebe5..2d4d44bf3 100644
--- a/pkg/sentry/fs/tty/fs.go
+++ b/pkg/sentry/fs/tty/fs.go
@@ -108,4 +108,4 @@ func (superOperations) ResetInodeMappings() {}
func (superOperations) SaveInodeMapping(*fs.Inode, string) {}
// Destroy implements MountSourceOperations.Destroy.
-func (superOperations) Destroy() {}
+func (superOperations) Destroy(context.Context) {}
diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go
index fe07fa929..e00746017 100644
--- a/pkg/sentry/fs/tty/master.go
+++ b/pkg/sentry/fs/tty/master.go
@@ -75,7 +75,7 @@ func newMasterInode(ctx context.Context, d *dirInodeOperations, owner fs.FileOwn
}
// Release implements fs.InodeOperations.Release.
-func (mi *masterInodeOperations) Release(ctx context.Context) {
+func (mi *masterInodeOperations) Release(context.Context) {
}
// Truncate implements fs.InodeOperations.Truncate.
@@ -120,9 +120,9 @@ type masterFileOperations struct {
var _ fs.FileOperations = (*masterFileOperations)(nil)
// Release implements fs.FileOperations.Release.
-func (mf *masterFileOperations) Release() {
- mf.d.masterClose(mf.t)
- mf.t.DecRef()
+func (mf *masterFileOperations) Release(ctx context.Context) {
+ mf.d.masterClose(ctx, mf.t)
+ mf.t.DecRef(ctx)
}
// EventRegister implements waiter.Waitable.EventRegister.
diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go
index 9871f6fc6..7c7292687 100644
--- a/pkg/sentry/fs/tty/slave.go
+++ b/pkg/sentry/fs/tty/slave.go
@@ -71,7 +71,7 @@ func newSlaveInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owne
// Release implements fs.InodeOperations.Release.
func (si *slaveInodeOperations) Release(ctx context.Context) {
- si.t.DecRef()
+ si.t.DecRef(ctx)
}
// Truncate implements fs.InodeOperations.Truncate.
@@ -106,7 +106,7 @@ type slaveFileOperations struct {
var _ fs.FileOperations = (*slaveFileOperations)(nil)
// Release implements fs.FileOperations.Release.
-func (sf *slaveFileOperations) Release() {
+func (sf *slaveFileOperations) Release(context.Context) {
}
// EventRegister implements waiter.Waitable.EventRegister.
diff --git a/pkg/sentry/fs/user/BUILD b/pkg/sentry/fs/user/BUILD
index f37f979f1..66e949c95 100644
--- a/pkg/sentry/fs/user/BUILD
+++ b/pkg/sentry/fs/user/BUILD
@@ -4,15 +4,21 @@ package(licenses = ["notice"])
go_library(
name = "user",
- srcs = ["user.go"],
+ srcs = [
+ "path.go",
+ "user.go",
+ ],
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
"//pkg/context",
"//pkg/fspath",
+ "//pkg/log",
"//pkg/sentry/fs",
+ "//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/vfs",
+ "//pkg/syserror",
"//pkg/usermem",
],
)
diff --git a/pkg/sentry/fs/user/path.go b/pkg/sentry/fs/user/path.go
new file mode 100644
index 000000000..2f5a43b84
--- /dev/null
+++ b/pkg/sentry/fs/user/path.go
@@ -0,0 +1,170 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package user
+
+import (
+ "fmt"
+ "path"
+ "strings"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// ResolveExecutablePath resolves the given executable name using the working
+// directory and environment.
+func ResolveExecutablePath(ctx context.Context, args *kernel.CreateProcessArgs) (string, error) {
+ name := args.Filename
+ if len(name) == 0 {
+ if len(args.Argv) == 0 {
+ return "", fmt.Errorf("no filename or command provided")
+ }
+ name = args.Argv[0]
+ }
+
+ // Absolute paths can be used directly.
+ if path.IsAbs(name) {
+ return name, nil
+ }
+
+ // Paths with '/' in them should be joined to the working directory, or
+ // to the root if working directory is not set.
+ if strings.IndexByte(name, '/') > 0 {
+ wd := args.WorkingDirectory
+ if wd == "" {
+ wd = "/"
+ }
+ if !path.IsAbs(wd) {
+ return "", fmt.Errorf("working directory %q must be absolute", wd)
+ }
+ return path.Join(wd, name), nil
+ }
+
+ // Otherwise, we must look up the name in the paths.
+ paths := getPath(args.Envv)
+ if kernel.VFS2Enabled {
+ f, err := resolveVFS2(ctx, args.Credentials, args.MountNamespaceVFS2, paths, name)
+ if err != nil {
+ return "", fmt.Errorf("error finding executable %q in PATH %v: %v", name, paths, err)
+ }
+ return f, nil
+ }
+
+ f, err := resolve(ctx, args.MountNamespace, paths, name)
+ if err != nil {
+ return "", fmt.Errorf("error finding executable %q in PATH %v: %v", name, paths, err)
+ }
+ return f, nil
+}
+
+func resolve(ctx context.Context, mns *fs.MountNamespace, paths []string, name string) (string, error) {
+ root := fs.RootFromContext(ctx)
+ if root == nil {
+ // Caller has no root. Don't bother traversing anything.
+ return "", syserror.ENOENT
+ }
+ defer root.DecRef(ctx)
+ for _, p := range paths {
+ if !path.IsAbs(p) {
+ // Relative paths aren't safe; no one should be using them.
+ log.Warningf("Skipping relative path %q in $PATH", p)
+ continue
+ }
+
+ binPath := path.Join(p, name)
+ traversals := uint(linux.MaxSymlinkTraversals)
+ d, err := mns.FindInode(ctx, root, nil, binPath, &traversals)
+ if err == syserror.ENOENT || err == syserror.EACCES {
+ // Didn't find it here.
+ continue
+ }
+ if err != nil {
+ return "", err
+ }
+ defer d.DecRef(ctx)
+
+ // Check that it is a regular file.
+ if !fs.IsRegular(d.Inode.StableAttr) {
+ continue
+ }
+
+ // Check whether we can read and execute the found file.
+ if err := d.Inode.CheckPermission(ctx, fs.PermMask{Read: true, Execute: true}); err != nil {
+ log.Infof("Found executable at %q, but user cannot execute it: %v", binPath, err)
+ continue
+ }
+ return path.Join("/", p, name), nil
+ }
+
+ // Couldn't find it.
+ return "", syserror.ENOENT
+}
+
+func resolveVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, paths []string, name string) (string, error) {
+ root := mns.Root()
+ defer root.DecRef(ctx)
+ for _, p := range paths {
+ if !path.IsAbs(p) {
+ // Relative paths aren't safe; no one should be using them.
+ log.Warningf("Skipping relative path %q in $PATH", p)
+ continue
+ }
+
+ binPath := path.Join(p, name)
+ pop := &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(binPath),
+ FollowFinalSymlink: true,
+ }
+ opts := &vfs.OpenOptions{
+ FileExec: true,
+ Flags: linux.O_RDONLY,
+ }
+ dentry, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, pop, opts)
+ if err == syserror.ENOENT || err == syserror.EACCES {
+ // Didn't find it here.
+ continue
+ }
+ if err != nil {
+ return "", err
+ }
+ dentry.DecRef(ctx)
+
+ return binPath, nil
+ }
+
+ // Couldn't find it.
+ return "", syserror.ENOENT
+}
+
+// getPath returns the PATH as a slice of strings given the environment
+// variables.
+func getPath(env []string) []string {
+ const prefix = "PATH="
+ for _, e := range env {
+ if strings.HasPrefix(e, prefix) {
+ return strings.Split(strings.TrimPrefix(e, prefix), ":")
+ }
+ }
+ return nil
+}
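
getPath and the resolver loops above are easy to exercise in isolation. Here getPath is reproduced verbatim so the example is self-contained, and the skip-relative behavior matches what resolve and resolveVFS2 do:

package main

import (
	"fmt"
	"path"
	"strings"
)

// getPath mirrors the helper added above: extract PATH from an environment
// slice and split it into candidate directories.
func getPath(env []string) []string {
	const prefix = "PATH="
	for _, e := range env {
		if strings.HasPrefix(e, prefix) {
			return strings.Split(strings.TrimPrefix(e, prefix), ":")
		}
	}
	return nil
}

func main() {
	env := []string{"HOME=/root", "PATH=/usr/local/bin:/usr/bin:bad/relative"}
	for _, p := range getPath(env) {
		if !path.IsAbs(p) {
			// resolve()/resolveVFS2() skip and warn on these entries.
			fmt.Printf("skipping relative entry %q\n", p)
			continue
		}
		fmt.Println("candidate:", path.Join(p, "ls"))
	}
}
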
diff --git a/pkg/sentry/fs/user/user.go b/pkg/sentry/fs/user/user.go
index fe7f67c00..936fd3932 100644
--- a/pkg/sentry/fs/user/user.go
+++ b/pkg/sentry/fs/user/user.go
@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+// Package user contains methods for resolving filesystem paths based on the
+// user and their environment.
package user
import (
@@ -60,7 +62,7 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.K
// doesn't exist we will return the default home directory.
return defaultHome, nil
}
- defer dirent.DecRef()
+ defer dirent.DecRef(ctx)
// Check read permissions on the file.
if err := dirent.Inode.CheckPermission(ctx, fs.PermMask{Read: true}); err != nil {
@@ -79,7 +81,7 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.K
if err != nil {
return "", err
}
- defer f.DecRef()
+ defer f.DecRef(ctx)
r := &fileReader{
Ctx: ctx,
@@ -103,7 +105,7 @@ func getExecUserHomeVFS2(ctx context.Context, mns *vfs.MountNamespace, uid auth.
const defaultHome = "/"
root := mns.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
creds := auth.CredentialsFromContext(ctx)
@@ -121,7 +123,7 @@ func getExecUserHomeVFS2(ctx context.Context, mns *vfs.MountNamespace, uid auth.
if err != nil {
return defaultHome, nil
}
- defer fd.DecRef()
+ defer fd.DecRef(ctx)
r := &fileReaderVFS2{
ctx: ctx,
diff --git a/pkg/sentry/fs/user/user_test.go b/pkg/sentry/fs/user/user_test.go
index 7d8e9ac7c..12b786224 100644
--- a/pkg/sentry/fs/user/user_test.go
+++ b/pkg/sentry/fs/user/user_test.go
@@ -39,7 +39,7 @@ func createEtcPasswd(ctx context.Context, root *fs.Dirent, contents string, mode
if err != nil {
return err
}
- defer etc.DecRef()
+ defer etc.DecRef(ctx)
switch mode.FileType() {
case 0:
// Don't create anything.
@@ -49,7 +49,7 @@ func createEtcPasswd(ctx context.Context, root *fs.Dirent, contents string, mode
if err != nil {
return err
}
- defer passwd.DecRef()
+ defer passwd.DecRef(ctx)
if _, err := passwd.Writev(ctx, usermem.BytesIOSequence([]byte(contents))); err != nil {
return err
}
@@ -110,9 +110,9 @@ func TestGetExecUserHome(t *testing.T) {
if err != nil {
t.Fatalf("NewMountNamespace failed: %v", err)
}
- defer mns.DecRef()
+ defer mns.DecRef(ctx)
root := mns.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
ctx = fs.WithRoot(ctx, root)
if err := createEtcPasswd(ctx, root, tc.passwdContents, tc.passwdMode); err != nil {
diff --git a/pkg/sentry/fsbridge/bridge.go b/pkg/sentry/fsbridge/bridge.go
index 8e7590721..7e61209ee 100644
--- a/pkg/sentry/fsbridge/bridge.go
+++ b/pkg/sentry/fsbridge/bridge.go
@@ -44,7 +44,7 @@ type File interface {
IncRef()
// DecRef decrements reference.
- DecRef()
+ DecRef(ctx context.Context)
}
// Lookup provides a common interface to open files.
diff --git a/pkg/sentry/fsbridge/fs.go b/pkg/sentry/fsbridge/fs.go
index 093ce1fb3..9785fd62a 100644
--- a/pkg/sentry/fsbridge/fs.go
+++ b/pkg/sentry/fsbridge/fs.go
@@ -49,7 +49,7 @@ func (f *fsFile) PathnameWithDeleted(ctx context.Context) string {
// global there.
return ""
}
- defer root.DecRef()
+ defer root.DecRef(ctx)
name, _ := f.file.Dirent.FullName(root)
return name
@@ -87,8 +87,8 @@ func (f *fsFile) IncRef() {
}
// DecRef implements File.
-func (f *fsFile) DecRef() {
- f.file.DecRef()
+func (f *fsFile) DecRef(ctx context.Context) {
+ f.file.DecRef(ctx)
}
// fsLookup implements Lookup interface using fs.File.
@@ -124,7 +124,7 @@ func (l *fsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptio
if err != nil {
return nil, err
}
- defer d.DecRef()
+ defer d.DecRef(ctx)
if !resolveFinal && fs.IsSymlink(d.Inode.StableAttr) {
return nil, syserror.ELOOP
diff --git a/pkg/sentry/fsbridge/vfs.go b/pkg/sentry/fsbridge/vfs.go
index 89168220a..323506d33 100644
--- a/pkg/sentry/fsbridge/vfs.go
+++ b/pkg/sentry/fsbridge/vfs.go
@@ -43,7 +43,7 @@ func NewVFSFile(file *vfs.FileDescription) File {
// PathnameWithDeleted implements File.
func (f *VFSFile) PathnameWithDeleted(ctx context.Context) string {
root := vfs.RootFromContext(ctx)
- defer root.DecRef()
+ defer root.DecRef(ctx)
vfsObj := f.file.VirtualDentry().Mount().Filesystem().VirtualFilesystem()
name, _ := vfsObj.PathnameWithDeleted(ctx, root, f.file.VirtualDentry())
@@ -86,8 +86,8 @@ func (f *VFSFile) IncRef() {
}
// DecRef implements File.
-func (f *VFSFile) DecRef() {
- f.file.DecRef()
+func (f *VFSFile) DecRef(ctx context.Context) {
+ f.file.DecRef(ctx)
}
// FileDescription returns the FileDescription represented by f. It does not
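
fsbridge exists so callers can handle VFS1 and VFS2 files uniformly, and the DecRef signature change above ripples through that interface. A trimmed-down sketch of the bridging idea; toyFile is hypothetical, and the real fsbridge.File carries more methods than shown here:

package main

import (
	"context"
	"fmt"
)

// File is a reduced version of the fsbridge.File interface from this diff:
// one abstraction both fs.File- and vfs.FileDescription-backed files satisfy.
type File interface {
	PathnameWithDeleted(ctx context.Context) string
	IncRef()
	DecRef(ctx context.Context)
}

type toyFile struct{ name string }

func (f *toyFile) PathnameWithDeleted(context.Context) string { return f.name }
func (f *toyFile) IncRef()                                    {}
func (f *toyFile) DecRef(context.Context)                     {}

func main() {
	ctx := context.Background()
	var f File = &toyFile{name: "/bin/true"}
	f.IncRef()
	defer f.DecRef(ctx)
	// Callers such as the loader need not know which VFS produced f.
	fmt.Println(f.PathnameWithDeleted(ctx))
}
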
diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD
index 585764223..93512c9b6 100644
--- a/pkg/sentry/fsimpl/devpts/BUILD
+++ b/pkg/sentry/fsimpl/devpts/BUILD
@@ -18,6 +18,7 @@ go_library(
"//pkg/context",
"//pkg/safemem",
"//pkg/sentry/arch",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/fsimpl/kernfs",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go
index c03c65445..7169e91af 100644
--- a/pkg/sentry/fsimpl/devpts/devpts.go
+++ b/pkg/sentry/fsimpl/devpts/devpts.go
@@ -103,9 +103,9 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds
}
// Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {
+func (fs *filesystem) Release(ctx context.Context) {
fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
- fs.Filesystem.Release()
+ fs.Filesystem.Release(ctx)
}
// rootInode is the root directory inode for the devpts mounts.
@@ -116,6 +116,8 @@ type rootInode struct {
kernfs.InodeNotSymlink
kernfs.OrderedChildren
+ locks vfs.FileLocks
+
// Keep a reference to this inode's dentry.
dentry kernfs.Dentry
@@ -183,7 +185,7 @@ func (i *rootInode) masterClose(t *Terminal) {
// Open implements kernfs.Inode.Open.
func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
if err != nil {
return nil, err
}
diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go
index 7a7ce5d81..3bb397f71 100644
--- a/pkg/sentry/fsimpl/devpts/master.go
+++ b/pkg/sentry/fsimpl/devpts/master.go
@@ -18,6 +18,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/unimpl"
@@ -34,6 +35,8 @@ type masterInode struct {
kernfs.InodeNotDirectory
kernfs.InodeNotSymlink
+ locks vfs.FileLocks
+
// Keep a reference to this inode's dentry.
dentry kernfs.Dentry
@@ -55,16 +58,17 @@ func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vf
inode: mi,
t: t,
}
+ fd.LockFD.Init(&mi.locks)
if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
- mi.DecRef()
+ mi.DecRef(ctx)
return nil, err
}
return &fd.vfsfd, nil
}
// Stat implements kernfs.Inode.Stat.
-func (mi *masterInode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
- statx, err := mi.InodeAttrs.Stat(vfsfs, opts)
+func (mi *masterInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+ statx, err := mi.InodeAttrs.Stat(ctx, vfsfs, opts)
if err != nil {
return linux.Statx{}, err
}
@@ -85,6 +89,7 @@ func (mi *masterInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds
type masterFileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
+ vfs.LockFD
inode *masterInode
t *Terminal
@@ -93,9 +98,9 @@ type masterFileDescription struct {
var _ vfs.FileDescriptionImpl = (*masterFileDescription)(nil)
// Release implements vfs.FileDescriptionImpl.Release.
-func (mfd *masterFileDescription) Release() {
+func (mfd *masterFileDescription) Release(ctx context.Context) {
mfd.inode.root.masterClose(mfd.t)
- mfd.inode.DecRef()
+ mfd.inode.DecRef(ctx)
}
// EventRegister implements waiter.Waitable.EventRegister.
@@ -181,7 +186,17 @@ func (mfd *masterFileDescription) SetStat(ctx context.Context, opts vfs.SetStatO
// Stat implements vfs.FileDescriptionImpl.Stat.
func (mfd *masterFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
fs := mfd.vfsfd.VirtualDentry().Mount().Filesystem()
- return mfd.inode.Stat(fs, opts)
+ return mfd.inode.Stat(ctx, fs, opts)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (mfd *masterFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return mfd.Locks().LockPOSIX(ctx, &mfd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (mfd *masterFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return mfd.Locks().UnlockPOSIX(ctx, &mfd.vfsfd, uid, start, length, whence)
}
// maybeEmitUnimplementedEvent emits unimplemented event if cmd is valid.
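
The devpts change gives each inode a vfs.FileLocks and has every file description delegate LockPOSIX/UnlockPOSIX to it via fd.LockFD.Init(&inode.locks), so POSIX locks taken through different FDs on the same inode share one lock table. A toy illustration of why the lock state lives on the inode rather than the FD (hypothetical types, not the vfs implementation):

package main

import (
	"fmt"
	"sync"
)

// fileLocks stands in for vfs.FileLocks: per-inode lock state.
type fileLocks struct {
	mu     sync.Mutex
	owners map[int]struct{}
}

// fd stands in for a file description; Open wires it to the inode's locks,
// like fd.LockFD.Init(&inode.locks) in the diff.
type fd struct {
	locks *fileLocks
}

func (f *fd) lockPOSIX(uid int) {
	f.locks.mu.Lock()
	defer f.locks.mu.Unlock()
	if f.locks.owners == nil {
		f.locks.owners = make(map[int]struct{})
	}
	f.locks.owners[uid] = struct{}{}
}

func main() {
	shared := &fileLocks{}
	a := &fd{locks: shared}
	b := &fd{locks: shared}
	a.lockPOSIX(1)
	b.lockPOSIX(2)
	fmt.Println("lock owners on the inode:", len(shared.owners)) // 2
}
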
diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go
index 526cd406c..32e4e1908 100644
--- a/pkg/sentry/fsimpl/devpts/slave.go
+++ b/pkg/sentry/fsimpl/devpts/slave.go
@@ -18,6 +18,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -33,6 +34,8 @@ type slaveInode struct {
kernfs.InodeNotDirectory
kernfs.InodeNotSymlink
+ locks vfs.FileLocks
+
// Keep a reference to this inode's dentry.
dentry kernfs.Dentry
@@ -51,8 +54,9 @@ func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs
fd := &slaveFileDescription{
inode: si,
}
+ fd.LockFD.Init(&si.locks)
if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
- si.DecRef()
+ si.DecRef(ctx)
return nil, err
}
return &fd.vfsfd, nil
@@ -69,8 +73,8 @@ func (si *slaveInode) Valid(context.Context) bool {
}
// Stat implements kernfs.Inode.Stat.
-func (si *slaveInode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
- statx, err := si.InodeAttrs.Stat(vfsfs, opts)
+func (si *slaveInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+ statx, err := si.InodeAttrs.Stat(ctx, vfsfs, opts)
if err != nil {
return linux.Statx{}, err
}
@@ -91,6 +95,7 @@ func (si *slaveInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds
type slaveFileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
+ vfs.LockFD
inode *slaveInode
}
@@ -98,8 +103,8 @@ type slaveFileDescription struct {
var _ vfs.FileDescriptionImpl = (*slaveFileDescription)(nil)
// Release implements vfs.FileDescriptionImpl.Release.
-func (sfd *slaveFileDescription) Release() {
- sfd.inode.DecRef()
+func (sfd *slaveFileDescription) Release(ctx context.Context) {
+ sfd.inode.DecRef(ctx)
}
// EventRegister implements waiter.Waitable.EventRegister.
@@ -127,7 +132,7 @@ func (sfd *slaveFileDescription) Write(ctx context.Context, src usermem.IOSequen
return sfd.inode.t.ld.outputQueueWrite(ctx, src)
}
-// Ioctl implements vfs.FileDescripionImpl.Ioctl.
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
func (sfd *slaveFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
switch cmd := args[1].Uint(); cmd {
case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
@@ -178,5 +183,15 @@ func (sfd *slaveFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOp
// Stat implements vfs.FileDescriptionImpl.Stat.
func (sfd *slaveFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
- return sfd.inode.Stat(fs, opts)
+ return sfd.inode.Stat(ctx, fs, opts)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (sfd *slaveFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return sfd.Locks().LockPOSIX(ctx, &sfd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (sfd *slaveFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return sfd.Locks().UnlockPOSIX(ctx, &sfd.vfsfd, uid, start, length, whence)
}
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
index 142ee53b0..2ed5fa8a9 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
@@ -92,9 +92,9 @@ func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth
}
// Release must be called when a is no longer in use.
-func (a *Accessor) Release() {
- a.root.DecRef()
- a.mntns.DecRef()
+func (a *Accessor) Release(ctx context.Context) {
+ a.root.DecRef(ctx)
+ a.mntns.DecRef(ctx)
}
// accessorContext implements context.Context by extending an existing
@@ -136,6 +136,8 @@ func (a *Accessor) pathOperationAt(pathname string) *vfs.PathOperation {
// CreateDeviceFile creates a device special file at the given pathname in the
// devtmpfs instance accessed by the Accessor.
func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind vfs.DeviceKind, major, minor uint32, perms uint16) error {
+ actx := a.wrapContext(ctx)
+
mode := (linux.FileMode)(perms)
switch kind {
case vfs.BlockDevice:
@@ -145,12 +147,24 @@ func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind v
default:
panic(fmt.Sprintf("invalid vfs.DeviceKind: %v", kind))
}
+
+ // Create any parent directories. See
+ // devtmpfs.c:handle_create()=>path_create().
+ for it := fspath.Parse(pathname).Begin; it.NextOk(); it = it.Next() {
+ pop := a.pathOperationAt(it.String())
+ if err := a.vfsObj.MkdirAt(actx, a.creds, pop, &vfs.MkdirOptions{
+ Mode: 0755,
+ }); err != nil {
+ return fmt.Errorf("failed to create directory %q: %v", it.String(), err)
+ }
+ }
+
// NOTE: Linux's devtmpfs refuses to automatically delete files it didn't
// create, which it recognizes by storing a pointer to the kdevtmpfs struct
// thread in struct inode::i_private. Accessor doesn't yet support deletion
// of files at all, and probably won't as long as we don't need to support
// kernel modules, so this is moot for now.
- return a.vfsObj.MknodAt(a.wrapContext(ctx), a.creds, a.pathOperationAt(pathname), &vfs.MknodOptions{
+ return a.vfsObj.MknodAt(actx, a.creds, a.pathOperationAt(pathname), &vfs.MknodOptions{
Mode: mode,
DevMajor: major,
DevMinor: minor,
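
CreateDeviceFile now creates each leading path component with MkdirAt before the final MknodAt. devtmpfs paths such as "pts/ptmx" are at most one directory deep, so each component lands directly under the devtmpfs root. A plain-Go stand-in for the fspath iteration, where it.NextOk() holds for every component except the last:

package main

import (
	"fmt"
	"strings"
)

// parentComponents returns the non-final components of pathname, mirroring
// which components the MkdirAt loop above visits. Illustration only; the
// sentry uses pkg/fspath iterators.
func parentComponents(pathname string) []string {
	parts := strings.Split(strings.Trim(pathname, "/"), "/")
	if len(parts) <= 1 {
		return nil
	}
	return parts[:len(parts)-1]
}

func main() {
	fmt.Println(parentComponents("pts/ptmx")) // [pts]: one MkdirAt, then MknodAt
	fmt.Println(parentComponents("null"))     // []: MknodAt only
}
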
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
index b6d52c015..747867cca 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
@@ -30,7 +30,7 @@ func TestDevtmpfs(t *testing.T) {
creds := auth.CredentialsFromContext(ctx)
vfsObj := &vfs.VirtualFilesystem{}
- if err := vfsObj.Init(); err != nil {
+ if err := vfsObj.Init(ctx); err != nil {
t.Fatalf("VFS init: %v", err)
}
// Register tmpfs just so that we can have a root filesystem that isn't
@@ -48,9 +48,9 @@ func TestDevtmpfs(t *testing.T) {
if err != nil {
t.Fatalf("failed to create tmpfs root mount: %v", err)
}
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
root := mntns.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
devpop := vfs.PathOperation{
Root: root,
Start: root,
@@ -69,7 +69,7 @@ func TestDevtmpfs(t *testing.T) {
if err != nil {
t.Fatalf("failed to create devtmpfs.Accessor: %v", err)
}
- defer a.Release()
+ defer a.Release(ctx)
// Create "userspace-initialized" files using a devtmpfs.Accessor.
if err := a.UserspaceInit(ctx); err != nil {
diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go
index c573d7935..812171fa3 100644
--- a/pkg/sentry/fsimpl/eventfd/eventfd.go
+++ b/pkg/sentry/fsimpl/eventfd/eventfd.go
@@ -37,6 +37,7 @@ type EventFileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
+ vfs.NoLockFD
// queue is used to notify interested parties when the event object
// becomes readable or writable.
@@ -58,9 +59,9 @@ type EventFileDescription struct {
var _ vfs.FileDescriptionImpl = (*EventFileDescription)(nil)
// New creates a new event fd.
-func New(vfsObj *vfs.VirtualFilesystem, initVal uint64, semMode bool, flags uint32) (*vfs.FileDescription, error) {
+func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, initVal uint64, semMode bool, flags uint32) (*vfs.FileDescription, error) {
vd := vfsObj.NewAnonVirtualDentry("[eventfd]")
- defer vd.DecRef()
+ defer vd.DecRef(ctx)
efd := &EventFileDescription{
val: initVal,
semMode: semMode,
@@ -106,7 +107,7 @@ func (efd *EventFileDescription) HostFD() (int, error) {
}
// Release implements FileDescriptionImpl.Release()
-func (efd *EventFileDescription) Release() {
+func (efd *EventFileDescription) Release(context.Context) {
efd.mu.Lock()
defer efd.mu.Unlock()
if efd.hostfd >= 0 {
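
EventFileDescription now embeds vfs.NoLockFD, and New threads a context through for the DecRef above; the counter semantics are unchanged. For reference, a toy model of eventfd read behavior: a normal read returns and clears the whole counter, while semaphore mode (EFD_SEMAPHORE) decrements by one per read. Toy types, not the sentry implementation:

package main

import "fmt"

type toyEventFD struct {
	val     uint64
	semMode bool
}

func (e *toyEventFD) write(v uint64) { e.val += v }

// read returns (value, ok); ok=false models a read that would block.
func (e *toyEventFD) read() (uint64, bool) {
	if e.val == 0 {
		return 0, false
	}
	if e.semMode {
		e.val--
		return 1, true
	}
	v := e.val
	e.val = 0
	return v, true
}

func main() {
	e := &toyEventFD{semMode: true}
	e.write(3)
	for {
		v, ok := e.read()
		if !ok {
			break
		}
		fmt.Println(v) // prints 1 three times in semaphore mode
	}
}
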
diff --git a/pkg/sentry/fsimpl/eventfd/eventfd_test.go b/pkg/sentry/fsimpl/eventfd/eventfd_test.go
index 20e3adffc..49916fa81 100644
--- a/pkg/sentry/fsimpl/eventfd/eventfd_test.go
+++ b/pkg/sentry/fsimpl/eventfd/eventfd_test.go
@@ -36,16 +36,16 @@ func TestEventFD(t *testing.T) {
for _, initVal := range initVals {
ctx := contexttest.Context(t)
vfsObj := &vfs.VirtualFilesystem{}
- if err := vfsObj.Init(); err != nil {
+ if err := vfsObj.Init(ctx); err != nil {
t.Fatalf("VFS init: %v", err)
}
// Make a new eventfd that is writable.
- eventfd, err := New(vfsObj, initVal, false, linux.O_RDWR)
+ eventfd, err := New(ctx, vfsObj, initVal, false, linux.O_RDWR)
if err != nil {
t.Fatalf("New() failed: %v", err)
}
- defer eventfd.DecRef()
+ defer eventfd.DecRef(ctx)
// Register a callback for a write event.
w, ch := waiter.NewChannelEntry(nil)
@@ -74,16 +74,16 @@ func TestEventFD(t *testing.T) {
func TestEventFDStat(t *testing.T) {
ctx := contexttest.Context(t)
vfsObj := &vfs.VirtualFilesystem{}
- if err := vfsObj.Init(); err != nil {
+ if err := vfsObj.Init(ctx); err != nil {
t.Fatalf("VFS init: %v", err)
}
// Make a new eventfd that is writable.
- eventfd, err := New(vfsObj, 0, false, linux.O_RDWR)
+ eventfd, err := New(ctx, vfsObj, 0, false, linux.O_RDWR)
if err != nil {
t.Fatalf("New() failed: %v", err)
}
- defer eventfd.DecRef()
+ defer eventfd.DecRef(ctx)
statx, err := eventfd.Stat(ctx, vfs.StatOptions{
Mask: linux.STATX_BASIC_STATS,
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index ff861d0fe..abc610ef3 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -54,6 +54,7 @@ go_library(
"//pkg/safemem",
"//pkg/sentry/arch",
"//pkg/sentry/fs",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/fsimpl/ext/disklayout",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/memmap",
@@ -95,7 +96,7 @@ go_test(
"//pkg/syserror",
"//pkg/test/testutil",
"//pkg/usermem",
- "@com_github_google_go-cmp//cmp:go_default_library",
- "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
+ "@com_github_google_go_cmp//cmp:go_default_library",
+ "@com_github_google_go_cmp//cmp/cmpopts:go_default_library",
],
)
diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index 89caee3df..8f7d5a9bb 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -53,7 +53,7 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys
// Create VFS.
vfsObj := &vfs.VirtualFilesystem{}
- if err := vfsObj.Init(); err != nil {
+ if err := vfsObj.Init(ctx); err != nil {
return nil, nil, nil, nil, err
}
vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
@@ -68,7 +68,7 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys
root := mntns.Root()
tearDown := func() {
- root.DecRef()
+ root.DecRef(ctx)
if err := f.Close(); err != nil {
b.Fatalf("tearDown failed: %v", err)
@@ -169,7 +169,7 @@ func BenchmarkVFS2ExtfsMountStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to walk to mount point: %v", err)
}
- defer mountPoint.DecRef()
+ defer mountPoint.DecRef(ctx)
// Create extfs submount.
mountTearDown := mount(b, fmt.Sprintf("/tmp/image-%d.ext4", depth), vfsfs, &pop)
diff --git a/pkg/sentry/fsimpl/ext/block_map_file.go b/pkg/sentry/fsimpl/ext/block_map_file.go
index a2d8c3ad6..8bb104ff0 100644
--- a/pkg/sentry/fsimpl/ext/block_map_file.go
+++ b/pkg/sentry/fsimpl/ext/block_map_file.go
@@ -58,15 +58,16 @@ var _ io.ReaderAt = (*blockMapFile)(nil)
// newBlockMapFile is the blockMapFile constructor. It initializes the file to
// physical blocks map with (at most) the first 12 (direct) blocks.
-func newBlockMapFile(regFile regularFile) (*blockMapFile, error) {
- file := &blockMapFile{regFile: regFile}
+func newBlockMapFile(args inodeArgs) (*blockMapFile, error) {
+ file := &blockMapFile{}
file.regFile.impl = file
+ file.regFile.inode.init(args, &file.regFile)
for i := uint(0); i < 4; i++ {
- file.coverage[i] = getCoverage(regFile.inode.blkSize, i)
+ file.coverage[i] = getCoverage(file.regFile.inode.blkSize, i)
}
- blkMap := regFile.inode.diskInode.Data()
+ blkMap := file.regFile.inode.diskInode.Data()
binary.Unmarshal(blkMap[:numDirectBlks*4], binary.LittleEndian, &file.directBlks)
binary.Unmarshal(blkMap[numDirectBlks*4:(numDirectBlks+1)*4], binary.LittleEndian, &file.indirectBlk)
binary.Unmarshal(blkMap[(numDirectBlks+1)*4:(numDirectBlks+2)*4], binary.LittleEndian, &file.doubleIndirectBlk)
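
newBlockMapFile now receives inodeArgs and initializes its embedded inode itself. The coverage table it precomputes follows the classic ext2 block-map geometry: an entry at indirection height h addresses blkSize * (blkSize/4)^h bytes, since an indirect block holds blkSize/4 four-byte block pointers. A sketch of that arithmetic; this is my reading of getCoverage, which is not itself shown in this diff:

package main

import "fmt"

// coverage computes how many file bytes one block-map entry at the given
// indirection height can address, assuming four-byte block pointers.
func coverage(blkSize uint64, height uint) uint64 {
	c := blkSize
	for i := uint(0); i < height; i++ {
		c *= blkSize / 4
	}
	return c
}

func main() {
	const blkSize = 1024
	for h := uint(0); h < 4; h++ {
		fmt.Printf("height %d covers %d bytes\n", h, coverage(blkSize, h))
	}
	// With 1 KiB blocks: 1 KiB, 256 KiB, 64 MiB, 16 GiB.
}
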
diff --git a/pkg/sentry/fsimpl/ext/block_map_test.go b/pkg/sentry/fsimpl/ext/block_map_test.go
index 181727ef7..6fa84e7aa 100644
--- a/pkg/sentry/fsimpl/ext/block_map_test.go
+++ b/pkg/sentry/fsimpl/ext/block_map_test.go
@@ -85,20 +85,6 @@ func (n *blkNumGen) next() uint32 {
// the inode covers and that is written to disk.
func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) {
mockDisk := make([]byte, mockBMDiskSize)
- regFile := regularFile{
- inode: inode{
- fs: &filesystem{
- dev: bytes.NewReader(mockDisk),
- },
- diskInode: &disklayout.InodeNew{
- InodeOld: disklayout.InodeOld{
- SizeLo: getMockBMFileFize(),
- },
- },
- blkSize: uint64(mockBMBlkSize),
- },
- }
-
var fileData []byte
blkNums := newBlkNumGen()
var data []byte
@@ -125,9 +111,20 @@ func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) {
data = binary.Marshal(data, binary.LittleEndian, triplyIndirectBlk)
fileData = append(fileData, writeFileDataToBlock(mockDisk, triplyIndirectBlk, 3, blkNums)...)
- copy(regFile.inode.diskInode.Data(), data)
+ args := inodeArgs{
+ fs: &filesystem{
+ dev: bytes.NewReader(mockDisk),
+ },
+ diskInode: &disklayout.InodeNew{
+ InodeOld: disklayout.InodeOld{
+ SizeLo: getMockBMFileFize(),
+ },
+ },
+ blkSize: uint64(mockBMBlkSize),
+ }
+ copy(args.diskInode.Data(), data)
- mockFile, err := newBlockMapFile(regFile)
+ mockFile, err := newBlockMapFile(args)
if err != nil {
t.Fatalf("newBlockMapFile failed: %v", err)
}
diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go
index bfbd7c3d4..7a1b4219f 100644
--- a/pkg/sentry/fsimpl/ext/dentry.go
+++ b/pkg/sentry/fsimpl/ext/dentry.go
@@ -15,6 +15,7 @@
package ext
import (
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/vfs"
)
@@ -55,8 +56,25 @@ func (d *dentry) TryIncRef() bool {
}
// DecRef implements vfs.DentryImpl.DecRef.
-func (d *dentry) DecRef() {
+func (d *dentry) DecRef(ctx context.Context) {
// FIXME(b/134676337): filesystem.mu may not be locked as required by
// inode.decRef().
d.inode.decRef()
}
+
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+//
+// TODO(b/134676337): Implement inotify.
+func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {}
+
+// Watches implements vfs.DentryImpl.Watches.
+//
+// TODO(b/134676337): Implement inotify.
+func (d *dentry) Watches() *vfs.Watches {
+ return nil
+}
+
+// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
+//
+// TODO(b/134676337): Implement inotify.
+func (d *dentry) OnZeroWatches(context.Context) {}
diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go
index 12b875c8f..0fc01668d 100644
--- a/pkg/sentry/fsimpl/ext/directory.go
+++ b/pkg/sentry/fsimpl/ext/directory.go
@@ -20,6 +20,7 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/fs"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
@@ -54,16 +55,15 @@ type directory struct {
}
// newDirectory is the directory constructor.
-func newDirectory(inode inode, newDirent bool) (*directory, error) {
+func newDirectory(args inodeArgs, newDirent bool) (*directory, error) {
file := &directory{
- inode: inode,
childCache: make(map[string]*dentry),
childMap: make(map[string]*dirent),
}
- file.inode.impl = file
+ file.inode.init(args, file)
// Initialize childList by reading dirents from the underlying file.
- if inode.diskInode.Flags().Index {
+ if args.diskInode.Flags().Index {
// TODO(b/134676337): Support hash tree directories. Currently only the '.'
// and '..' entries are read in.
@@ -74,7 +74,7 @@ func newDirectory(inode inode, newDirent bool) (*directory, error) {
// The dirents are organized in a linear array in the file data.
// Extract the file data and decode the dirents.
- regFile, err := newRegularFile(inode)
+ regFile, err := newRegularFile(args)
if err != nil {
return nil, err
}
@@ -82,7 +82,7 @@ func newDirectory(inode inode, newDirent bool) (*directory, error) {
// buf is used as scratch space for reading in dirents from disk and
// unmarshalling them into dirent structs.
buf := make([]byte, disklayout.DirentSize)
- size := inode.diskInode.Size()
+ size := args.diskInode.Size()
for off, inc := uint64(0), uint64(0); off < size; off += inc {
toRead := size - off
if toRead > disklayout.DirentSize {
@@ -142,7 +142,7 @@ type directoryFD struct {
var _ vfs.FileDescriptionImpl = (*directoryFD)(nil)
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *directoryFD) Release() {
+func (fd *directoryFD) Release(ctx context.Context) {
if fd.iter == nil {
return
}
@@ -306,3 +306,13 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in
fd.off = offset
return offset, nil
}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *directoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *directoryFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go
index dac6effbf..08ffc2834 100644
--- a/pkg/sentry/fsimpl/ext/ext.go
+++ b/pkg/sentry/fsimpl/ext/ext.go
@@ -123,32 +123,32 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
fs.vfsfs.Init(vfsObj, &fsType, &fs)
fs.sb, err = readSuperBlock(dev)
if err != nil {
- fs.vfsfs.DecRef()
+ fs.vfsfs.DecRef(ctx)
return nil, nil, err
}
if fs.sb.Magic() != linux.EXT_SUPER_MAGIC {
// mount(2) specifies that EINVAL should be returned if the superblock is
// invalid.
- fs.vfsfs.DecRef()
+ fs.vfsfs.DecRef(ctx)
return nil, nil, syserror.EINVAL
}
// Refuse to mount if the filesystem is incompatible.
if !isCompatible(fs.sb) {
- fs.vfsfs.DecRef()
+ fs.vfsfs.DecRef(ctx)
return nil, nil, syserror.EINVAL
}
fs.bgs, err = readBlockGroups(dev, fs.sb)
if err != nil {
- fs.vfsfs.DecRef()
+ fs.vfsfs.DecRef(ctx)
return nil, nil, err
}
rootInode, err := fs.getOrCreateInodeLocked(disklayout.RootDirInode)
if err != nil {
- fs.vfsfs.DecRef()
+ fs.vfsfs.DecRef(ctx)
return nil, nil, err
}
rootInode.incRef()
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 64e9a579f..2dbaee287 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -65,7 +65,7 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys
// Create VFS.
vfsObj := &vfs.VirtualFilesystem{}
- if err := vfsObj.Init(); err != nil {
+ if err := vfsObj.Init(ctx); err != nil {
t.Fatalf("VFS init: %v", err)
}
vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
@@ -80,7 +80,7 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys
root := mntns.Root()
tearDown := func() {
- root.DecRef()
+ root.DecRef(ctx)
if err := f.Close(); err != nil {
t.Fatalf("tearDown failed: %v", err)
diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go
index 11dcc0346..c36225a7c 100644
--- a/pkg/sentry/fsimpl/ext/extent_file.go
+++ b/pkg/sentry/fsimpl/ext/extent_file.go
@@ -38,9 +38,10 @@ var _ io.ReaderAt = (*extentFile)(nil)
// newExtentFile is the extent file constructor. It reads the entire extent
// tree into memory.
// TODO(b/134676337): Build extent tree on demand to reduce memory usage.
-func newExtentFile(regFile regularFile) (*extentFile, error) {
- file := &extentFile{regFile: regFile}
+func newExtentFile(args inodeArgs) (*extentFile, error) {
+ file := &extentFile{}
file.regFile.impl = file
+ file.regFile.inode.init(args, &file.regFile)
err := file.buildExtTree()
if err != nil {
return nil, err
diff --git a/pkg/sentry/fsimpl/ext/extent_test.go b/pkg/sentry/fsimpl/ext/extent_test.go
index a2382daa3..cd10d46ee 100644
--- a/pkg/sentry/fsimpl/ext/extent_test.go
+++ b/pkg/sentry/fsimpl/ext/extent_test.go
@@ -177,21 +177,19 @@ func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (*extentFile, []
t.Helper()
mockDisk := make([]byte, mockExtentBlkSize*10)
- mockExtentFile := &extentFile{
- regFile: regularFile{
- inode: inode{
- fs: &filesystem{
- dev: bytes.NewReader(mockDisk),
- },
- diskInode: &disklayout.InodeNew{
- InodeOld: disklayout.InodeOld{
- SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root),
- },
- },
- blkSize: mockExtentBlkSize,
+ mockExtentFile := &extentFile{}
+ args := inodeArgs{
+ fs: &filesystem{
+ dev: bytes.NewReader(mockDisk),
+ },
+ diskInode: &disklayout.InodeNew{
+ InodeOld: disklayout.InodeOld{
+ SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root),
},
},
+ blkSize: mockExtentBlkSize,
}
+ mockExtentFile.regFile.inode.init(args, &mockExtentFile.regFile)
fileData := writeTree(&mockExtentFile.regFile.inode, mockDisk, node0, mockExtentBlkSize)
diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go
index 92f7da40d..90b086468 100644
--- a/pkg/sentry/fsimpl/ext/file_description.go
+++ b/pkg/sentry/fsimpl/ext/file_description.go
@@ -26,6 +26,7 @@ import (
type fileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
+ vfs.LockFD
}
func (fd *fileDescription) filesystem() *filesystem {
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index 557963e03..c714ddf73 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -84,7 +84,7 @@ var _ vfs.FilesystemImpl = (*filesystem)(nil)
// - filesystem.mu must be locked (for writing if write param is true).
// - !rp.Done().
// - inode == vfsd.Impl().(*Dentry).inode.
-func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) {
+func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) {
if !inode.isDir() {
return nil, nil, syserror.ENOTDIR
}
@@ -100,7 +100,7 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo
}
d := vfsd.Impl().(*dentry)
if name == ".." {
- isRoot, err := rp.CheckRoot(vfsd)
+ isRoot, err := rp.CheckRoot(ctx, vfsd)
if err != nil {
return nil, nil, err
}
@@ -108,7 +108,7 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo
rp.Advance()
return vfsd, inode, nil
}
- if err := rp.CheckMount(&d.parent.vfsd); err != nil {
+ if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
return nil, nil, err
}
rp.Advance()
@@ -143,7 +143,7 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo
child.name = name
dir.childCache[name] = child
}
- if err := rp.CheckMount(&child.vfsd); err != nil {
+ if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
return nil, nil, err
}
if child.inode.isSymlink() && rp.ShouldFollowSymlink() {
@@ -167,12 +167,12 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo
//
// Preconditions:
// - filesystem.mu must be locked (for writing if write param is true).
-func walkLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
+func walkLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
vfsd := rp.Start()
inode := vfsd.Impl().(*dentry).inode
for !rp.Done() {
var err error
- vfsd, inode, err = stepLocked(rp, vfsd, inode, write)
+ vfsd, inode, err = stepLocked(ctx, rp, vfsd, inode, write)
if err != nil {
return nil, nil, err
}
@@ -196,12 +196,12 @@ func walkLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error)
// Preconditions:
// - filesystem.mu must be locked (for writing if write param is true).
// - !rp.Done().
-func walkParentLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
+func walkParentLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
vfsd := rp.Start()
inode := vfsd.Impl().(*dentry).inode
for !rp.Final() {
var err error
- vfsd, inode, err = stepLocked(rp, vfsd, inode, write)
+ vfsd, inode, err = stepLocked(ctx, rp, vfsd, inode, write)
if err != nil {
return nil, nil, err
}
@@ -216,7 +216,7 @@ func walkParentLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, e
// the rp until the parent of the last component, which should be an existing
// directory. If parent is false then it resolves rp entirely. Attempts to resolve
// the path as far as it can with a read lock and upgrades the lock if needed.
-func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) {
+func (fs *filesystem) walk(ctx context.Context, rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) {
var (
vfsd *vfs.Dentry
inode *inode
@@ -227,9 +227,9 @@ func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *in
// of disk. This reduces congestion (allows concurrent walks).
fs.mu.RLock()
if parent {
- vfsd, inode, err = walkParentLocked(rp, false)
+ vfsd, inode, err = walkParentLocked(ctx, rp, false)
} else {
- vfsd, inode, err = walkLocked(rp, false)
+ vfsd, inode, err = walkLocked(ctx, rp, false)
}
fs.mu.RUnlock()
@@ -238,9 +238,9 @@ func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *in
// walk is fine as this is a read only filesystem.
fs.mu.Lock()
if parent {
- vfsd, inode, err = walkParentLocked(rp, true)
+ vfsd, inode, err = walkParentLocked(ctx, rp, true)
} else {
- vfsd, inode, err = walkLocked(rp, true)
+ vfsd, inode, err = walkLocked(ctx, rp, true)
}
fs.mu.Unlock()
}
@@ -283,7 +283,7 @@ func (fs *filesystem) statTo(stat *linux.Statfs) {
// AccessAt implements vfs.Filesystem.Impl.AccessAt.
func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
- _, inode, err := fs.walk(rp, false)
+ _, inode, err := fs.walk(ctx, rp, false)
if err != nil {
return err
}
@@ -292,7 +292,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds
// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
- vfsd, inode, err := fs.walk(rp, false)
+ vfsd, inode, err := fs.walk(ctx, rp, false)
if err != nil {
return nil, err
}
@@ -312,7 +312,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
- vfsd, inode, err := fs.walk(rp, true)
+ vfsd, inode, err := fs.walk(ctx, rp, true)
if err != nil {
return nil, err
}
@@ -322,7 +322,7 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa
// OpenAt implements vfs.FilesystemImpl.OpenAt.
func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- vfsd, inode, err := fs.walk(rp, false)
+ vfsd, inode, err := fs.walk(ctx, rp, false)
if err != nil {
return nil, err
}
@@ -336,7 +336,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
- _, inode, err := fs.walk(rp, false)
+ _, inode, err := fs.walk(ctx, rp, false)
if err != nil {
return "", err
}
@@ -349,7 +349,7 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st
// StatAt implements vfs.FilesystemImpl.StatAt.
func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
- _, inode, err := fs.walk(rp, false)
+ _, inode, err := fs.walk(ctx, rp, false)
if err != nil {
return linux.Statx{}, err
}
@@ -360,7 +360,7 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
- if _, _, err := fs.walk(rp, false); err != nil {
+ if _, _, err := fs.walk(ctx, rp, false); err != nil {
return linux.Statfs{}, err
}
@@ -370,7 +370,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
}
// Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {
+func (fs *filesystem) Release(ctx context.Context) {
fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
}
@@ -390,7 +390,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
return syserror.EEXIST
}
- if _, _, err := fs.walk(rp, true); err != nil {
+ if _, _, err := fs.walk(ctx, rp, true); err != nil {
return err
}
@@ -403,7 +403,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
return syserror.EEXIST
}
- if _, _, err := fs.walk(rp, true); err != nil {
+ if _, _, err := fs.walk(ctx, rp, true); err != nil {
return err
}
@@ -416,7 +416,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
return syserror.EEXIST
}
- _, _, err := fs.walk(rp, true)
+ _, _, err := fs.walk(ctx, rp, true)
if err != nil {
return err
}
@@ -430,7 +430,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
return syserror.ENOENT
}
- _, _, err := fs.walk(rp, false)
+ _, _, err := fs.walk(ctx, rp, false)
if err != nil {
return err
}
@@ -440,7 +440,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
- _, inode, err := fs.walk(rp, false)
+ _, inode, err := fs.walk(ctx, rp, false)
if err != nil {
return err
}
@@ -454,7 +454,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
- _, _, err := fs.walk(rp, false)
+ _, _, err := fs.walk(ctx, rp, false)
if err != nil {
return err
}
@@ -468,7 +468,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
return syserror.EEXIST
}
- _, _, err := fs.walk(rp, true)
+ _, _, err := fs.walk(ctx, rp, true)
if err != nil {
return err
}
@@ -478,7 +478,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
- _, inode, err := fs.walk(rp, false)
+ _, inode, err := fs.walk(ctx, rp, false)
if err != nil {
return err
}
@@ -492,7 +492,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
- _, inode, err := fs.walk(rp, false)
+ _, inode, err := fs.walk(ctx, rp, false)
if err != nil {
return nil, err
}
@@ -506,7 +506,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
- _, _, err := fs.walk(rp, false)
+ _, _, err := fs.walk(ctx, rp, false)
if err != nil {
return nil, err
}
@@ -515,7 +515,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
- _, _, err := fs.walk(rp, false)
+ _, _, err := fs.walk(ctx, rp, false)
if err != nil {
return "", err
}
@@ -524,7 +524,7 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
- _, _, err := fs.walk(rp, false)
+ _, _, err := fs.walk(ctx, rp, false)
if err != nil {
return err
}
@@ -533,7 +533,7 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
- _, _, err := fs.walk(rp, false)
+ _, _, err := fs.walk(ctx, rp, false)
if err != nil {
return err
}
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index 485f86f4b..30636cf66 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -54,6 +54,8 @@ type inode struct {
// diskInode gives us access to the inode struct on disk. Immutable.
diskInode disklayout.Inode
+ locks vfs.FileLocks
+
// This is immutable. The implementations must embed this inode as their
// first field to ensure temporality.
impl interface{}
@@ -115,7 +117,7 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
}
// Build the inode based on its type.
- inode := inode{
+ args := inodeArgs{
fs: fs,
inodeNum: inodeNum,
blkSize: blkSize,
@@ -124,19 +126,19 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
switch diskInode.Mode().FileType() {
case linux.ModeSymlink:
- f, err := newSymlink(inode)
+ f, err := newSymlink(args)
if err != nil {
return nil, err
}
return &f.inode, nil
case linux.ModeRegular:
- f, err := newRegularFile(inode)
+ f, err := newRegularFile(args)
if err != nil {
return nil, err
}
return &f.inode, nil
case linux.ModeDirectory:
- f, err := newDirectory(inode, fs.sb.IncompatibleFeatures().DirentFileType)
+ f, err := newDirectory(args, fs.sb.IncompatibleFeatures().DirentFileType)
if err != nil {
return nil, err
}
@@ -147,6 +149,21 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
}
}
+type inodeArgs struct {
+ fs *filesystem
+ inodeNum uint32
+ blkSize uint64
+ diskInode disklayout.Inode
+}
+
+func (in *inode) init(args inodeArgs, impl interface{}) {
+ in.fs = args.fs
+ in.inodeNum = args.inodeNum
+ in.blkSize = args.blkSize
+ in.diskInode = args.diskInode
+ in.impl = impl
+}
+
// open creates and returns a file description for the dentry passed in.
func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
ats := vfs.AccessTypesForOpenFlags(opts)
@@ -157,6 +174,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt
switch in.impl.(type) {
case *regularFile:
var fd regularFileFD
+ fd.LockFD.Init(&in.locks)
if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
return nil, err
}
@@ -168,6 +186,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt
return nil, syserror.EISDIR
}
var fd directoryFD
+ fd.LockFD.Init(&in.locks)
if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
return nil, err
}
@@ -178,6 +197,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt
return nil, syserror.ELOOP
}
var fd symlinkFD
+ fd.LockFD.Init(&in.locks)
fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
return &fd.vfsfd, nil
default:
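Editor's note: the refactor above stops copying a partially built inode into each implementation. Instead, newInode passes an inodeArgs bundle down, and each implementation embeds the inode and calls init, which also wires impl back to the embedding type. A minimal standalone sketch of the same pattern (names are illustrative, not the ext package's):

package main

import "fmt"

// inodeArgs carries everything needed to fill in an inode, so no
// half-built inode value is ever copied around.
type inodeArgs struct {
	inodeNum uint32
	blkSize  uint64
}

// inode is embedded as the first field of every implementation.
type inode struct {
	inodeNum uint32
	blkSize  uint64
	impl     interface{} // points back at the embedding implementation
}

func (in *inode) init(args inodeArgs, impl interface{}) {
	in.inodeNum = args.inodeNum
	in.blkSize = args.blkSize
	in.impl = impl
}

type symlink struct {
	inode  inode // must be the first field
	target string
}

func newSymlink(args inodeArgs, target string) *symlink {
	f := &symlink{target: target}
	f.inode.init(args, f) // impl now points at the finished symlink
	return f
}

func main() {
	s := newSymlink(inodeArgs{inodeNum: 12, blkSize: 4096}, "/tmp/x")
	fmt.Println(s.inode.impl.(*symlink).target) // /tmp/x
}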
diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go
index 30135ddb0..e73e740d6 100644
--- a/pkg/sentry/fsimpl/ext/regular_file.go
+++ b/pkg/sentry/fsimpl/ext/regular_file.go
@@ -20,6 +20,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/safemem"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
@@ -43,28 +44,19 @@ type regularFile struct {
// newRegularFile is the regularFile constructor. It figures out what kind of
// file this is and initializes the fileReader.
-func newRegularFile(inode inode) (*regularFile, error) {
- regFile := regularFile{
- inode: inode,
- }
-
- inodeFlags := inode.diskInode.Flags()
-
- if inodeFlags.Extents {
- file, err := newExtentFile(regFile)
+func newRegularFile(args inodeArgs) (*regularFile, error) {
+ if args.diskInode.Flags().Extents {
+ file, err := newExtentFile(args)
if err != nil {
return nil, err
}
-
- file.regFile.inode.impl = &file.regFile
return &file.regFile, nil
}
- file, err := newBlockMapFile(regFile)
+ file, err := newBlockMapFile(args)
if err != nil {
return nil, err
}
- file.regFile.inode.impl = &file.regFile
return &file.regFile, nil
}
@@ -77,6 +69,7 @@ func (in *inode) isRegular() bool {
// vfs.FileDescriptionImpl.
type regularFileFD struct {
fileDescription
+ vfs.LockFD
// off is the file offset. off is accessed using atomic memory operations.
off int64
@@ -86,7 +79,7 @@ type regularFileFD struct {
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *regularFileFD) Release() {}
+func (fd *regularFileFD) Release(context.Context) {}
// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
@@ -157,3 +150,13 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt
// TODO(b/134676337): Implement mmap(2).
return syserror.ENODEV
}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
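Editor's note: with vfs.LockFD embedded, every FD created by open points its lock state at the single vfs.FileLocks owned by the inode, so POSIX locks taken through one FD are visible to all others. A simplified standalone sketch of that sharing (a toy whole-file lock instead of gVisor's byte-range lock set):

package main

import (
	"fmt"
	"sync"
)

// fileLocks is per-inode lock state, shared by all FDs of that inode.
type fileLocks struct {
	mu    sync.Mutex
	owner string // "" means unlocked; real impls track byte ranges per UID
}

// lockFD is embedded by every file description and points at the
// inode's shared lock state, mirroring fd.LockFD.Init(&in.locks).
type lockFD struct{ locks *fileLocks }

func (fd *lockFD) init(l *fileLocks) { fd.locks = l }

func (fd *lockFD) tryLock(who string) bool {
	fd.locks.mu.Lock()
	defer fd.locks.mu.Unlock()
	if fd.locks.owner != "" && fd.locks.owner != who {
		return false // a conflicting owner already holds the lock
	}
	fd.locks.owner = who
	return true
}

func main() {
	var inodeLocks fileLocks // one per inode
	var fd1, fd2 lockFD
	fd1.init(&inodeLocks) // both FDs share the inode's lock set
	fd2.init(&inodeLocks)

	fmt.Println(fd1.tryLock("A")) // true
	fmt.Println(fd2.tryLock("B")) // false: conflict via the shared state
}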
diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go
index 1447a4dc1..2fd0d1fa8 100644
--- a/pkg/sentry/fsimpl/ext/symlink.go
+++ b/pkg/sentry/fsimpl/ext/symlink.go
@@ -30,18 +30,17 @@ type symlink struct {
// newSymlink is the symlink constructor. It reads out the symlink target from
// the inode (however it might have been stored).
-func newSymlink(inode inode) (*symlink, error) {
- var file *symlink
+func newSymlink(args inodeArgs) (*symlink, error) {
var link []byte
// If the symlink target is less than 60 bytes, it is stored in inode.Data().
// Otherwise either extents or block maps will be used to store the link.
- size := inode.diskInode.Size()
+ size := args.diskInode.Size()
if size < 60 {
- link = inode.diskInode.Data()[:size]
+ link = args.diskInode.Data()[:size]
} else {
// Create a regular file out of this inode and read out the target.
- regFile, err := newRegularFile(inode)
+ regFile, err := newRegularFile(args)
if err != nil {
return nil, err
}
@@ -52,8 +51,8 @@ func newSymlink(inode inode) (*symlink, error) {
}
}
- file = &symlink{inode: inode, target: string(link)}
- file.inode.impl = file
+ file := &symlink{target: string(link)}
+ file.inode.init(args, file)
return file, nil
}
@@ -67,13 +66,14 @@ func (in *inode) isSymlink() bool {
// O_PATH. For this reason most of the functions return EBADF.
type symlinkFD struct {
fileDescription
+ vfs.NoLockFD
}
// Compiles only if symlinkFD implements vfs.FileDescriptionImpl.
var _ vfs.FileDescriptionImpl = (*symlinkFD)(nil)
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *symlinkFD) Release() {}
+func (fd *symlinkFD) Release(context.Context) {}
// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *symlinkFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
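Editor's note: ext keeps symlink targets shorter than 60 bytes inline in the on-disk inode's data area, which is why newSymlink only falls back to the regular-file readers for longer targets. A toy version of the cutoff (the 60-byte constant mirrors the check above):

package main

import "fmt"

const inlineSymlinkMax = 60 // size of the ext inode's data (i_block) area

// symlinkTarget picks the inline copy for short targets and falls back
// to reading data blocks otherwise, mirroring newSymlink's branch.
func symlinkTarget(size int, inlineData []byte, readBlocks func() []byte) string {
	if size < inlineSymlinkMax {
		return string(inlineData[:size]) // stored in inode.Data()
	}
	return string(readBlocks()[:size]) // stored via extents/block maps
}

func main() {
	inline := []byte("/tmp/x\x00\x00") // first 6 bytes are the target
	fmt.Println(symlinkTarget(6, inline, nil)) // /tmp/x
}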
diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD
new file mode 100644
index 000000000..999111deb
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/BUILD
@@ -0,0 +1,63 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+licenses(["notice"])
+
+go_template_instance(
+ name = "request_list",
+ out = "request_list.go",
+ package = "fuse",
+ prefix = "request",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*Request",
+ "Linker": "*Request",
+ },
+)
+
+go_library(
+ name = "fuse",
+ srcs = [
+ "connection.go",
+ "dev.go",
+ "fusefs.go",
+ "init.go",
+ "register.go",
+ "request_list.go",
+ ],
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/context",
+ "//pkg/log",
+ "//pkg/sentry/fsimpl/devtmpfs",
+ "//pkg/sentry/fsimpl/kernfs",
+ "//pkg/sentry/kernel",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/vfs",
+ "//pkg/sync",
+ "//pkg/syserror",
+ "//pkg/usermem",
+ "//pkg/waiter",
+ "//tools/go_marshal/marshal",
+ "@org_golang_x_sys//unix:go_default_library",
+ ],
+)
+
+go_test(
+ name = "fuse_test",
+ size = "small",
+ srcs = ["dev_test.go"],
+ library = ":fuse",
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/sentry/fsimpl/testutil",
+ "//pkg/sentry/kernel",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/vfs",
+ "//pkg/syserror",
+ "//pkg/usermem",
+ "//pkg/waiter",
+ "//tools/go_marshal/marshal",
+ ],
+)
diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go
new file mode 100644
index 000000000..6df2728ab
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/connection.go
@@ -0,0 +1,437 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "errors"
+ "fmt"
+ "sync"
+ "sync/atomic"
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/waiter"
+ "gvisor.dev/gvisor/tools/go_marshal/marshal"
+)
+
+// maxActiveRequestsDefault is the default setting controlling the upper bound
+// on the number of active requests at any given time.
+const maxActiveRequestsDefault = 10000
+
+// Ordinary requests have even IDs, while interrupt IDs are odd.
+// Used to increment the unique ID for each FUSE request.
+var reqIDStep uint64 = 2
+
+const (
+ // fuseDefaultMaxBackground is the default value for MaxBackground.
+ fuseDefaultMaxBackground = 12
+
+ // fuseDefaultCongestionThreshold is the default value for CongestionThreshold,
+ // and is 75% of the default MaxBackground.
+ fuseDefaultCongestionThreshold = (fuseDefaultMaxBackground * 3 / 4)
+
+ // fuseDefaultMaxPagesPerReq is the default value for MaxPagesPerReq.
+ fuseDefaultMaxPagesPerReq = 32
+)
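Editor's note: stepping nextOpID by reqIDStep keeps ordinary request IDs even, which leaves the whole odd ID space free for interrupts; Linux derives an interrupt's ID by setting the low bit of the original request ID (stated here as an assumption). A standalone sketch:

package main

import "fmt"

const reqIDStep = 2 // ordinary request IDs advance by 2, staying even

func main() {
	var nextOpID uint64
	for i := 0; i < 3; i++ {
		nextOpID += reqIDStep
		fmt.Println("request ID:", nextOpID) // 2, 4, 6: always even
	}
	// An interrupt for request 4 can use the odd ID 4|1 = 5 without
	// ever colliding with an ordinary request ID.
	fmt.Println("interrupt ID:", 4|1)
}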
+
+// Request represents a FUSE operation request that hasn't been sent to the
+// server yet.
+//
+// +stateify savable
+type Request struct {
+ requestEntry
+
+ id linux.FUSEOpID
+ hdr *linux.FUSEHeaderIn
+ data []byte
+}
+
+// Response represents an actual response from the server, including the
+// response payload.
+//
+// +stateify savable
+type Response struct {
+ opcode linux.FUSEOpcode
+ hdr linux.FUSEHeaderOut
+ data []byte
+}
+
+// connection is the struct by which the sentry communicates with the FUSE server daemon.
+type connection struct {
+ fd *DeviceFD
+
+ // The following FUSE_INIT flags are currently unsupported by this implementation:
+ // - FUSE_ATOMIC_O_TRUNC: requires open(..., O_TRUNC)
+ // - FUSE_EXPORT_SUPPORT
+ // - FUSE_HANDLE_KILLPRIV
+ // - FUSE_POSIX_LOCKS: requires POSIX locks
+ // - FUSE_FLOCK_LOCKS: requires POSIX locks
+ // - FUSE_AUTO_INVAL_DATA: requires page caching eviction
+ // - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction
+ // - FUSE_DO_READDIRPLUS/FUSE_READDIRPLUS_AUTO: requires FUSE_READDIRPLUS implementation
+ // - FUSE_ASYNC_DIO
+ // - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler
+
+ // initialized is set after receiving the FUSE_INIT reply.
+ // Until it's set, sending FUSE requests is suspended.
+ // Use SetInitialized() and Initialized() for atomic access.
+ initialized int32
+
+ // initializedChan is used to block requests before initialization.
+ initializedChan chan struct{}
+
+ // blocked when there are too many outstanding background requests (numBackground == maxBackground).
+ // TODO(gvisor.dev/issue/3185): update the numBackground accordingly; use a channel to block.
+ blocked bool
+
+ // connected is true once the connection is established, i.e. when a new
+ // FUSE filesystem is created. Set to false on umount, connection abort,
+ // or device release.
+ connected bool
+
+ // aborted via sysfs.
+ // TODO(gvisor.dev/issue/3185): abort all queued requests.
+ aborted bool
+
+ // connInitError is true if FUSE_INIT encountered an error (e.g. a major
+ // version mismatch). Only set in INIT.
+ connInitError bool
+
+ // connInitSuccess is true if FUSE_INIT completed successfully.
+ // Only set in INIT.
+ // Used during destruction.
+ connInitSuccess bool
+
+ // TODO(gvisor.dev/issue/3185): All the queue logic is a work in progress.
+
+ // numBackground is the number of requests in the background.
+ numBackground uint16
+
+ // congestionThreshold for numBackground.
+ // Negotiated in FUSE_INIT.
+ congestionThreshold uint16
+
+ // maxBackground is the maximum value of numBackground.
+ // The connection blocks when it is reached.
+ // Negotiated in FUSE_INIT.
+ maxBackground uint16
+
+ // numActiveBackground is the number of background requests that have been
+ // marked as active.
+ numActiveBackground uint16
+
+ // numWaiting is the number of requests waiting for completion.
+ numWaiting uint32
+
+ // TODO(gvisor.dev/issue/3185): BgQueue
+ // some queue for background queued requests.
+
+ // bgLock protects:
+ // maxBackground, congestionThreshold, numBackground,
+ // numActiveBackground, bgQueue, blocked.
+ bgLock sync.Mutex
+
+ // maxRead is the maximum size of a read buffer in bytes.
+ maxRead uint32
+
+ // maxWrite is the maximum size of a write buffer in bytes.
+ // Negotiated in FUSE_INIT.
+ maxWrite uint32
+
+ // maxPages is the maximum number of pages for a single request to use.
+ // Negotiated in FUSE_INIT.
+ maxPages uint16
+
+ // minor version of the FUSE protocol.
+ // Negotiated and only set in INIT.
+ minor uint32
+
+ // asyncRead is true if pages are read asynchronously.
+ // Negotiated and only set in INIT.
+ asyncRead bool
+
+ // abortErr is true if the kernel needs to return a unique read error after abort.
+ // Negotiated and only set in INIT.
+ abortErr bool
+
+ // writebackCache is true for write-back cache policy,
+ // false for write-through policy.
+ // Negotiated and only set in INIT.
+ writebackCache bool
+
+ // cacheSymlinks is true if the filesystem should cache READLINK responses in the page cache.
+ // Negotiated and only set in INIT.
+ cacheSymlinks bool
+
+ // bigWrites is true if multi-page cached writes are allowed.
+ // Negotiated and only set in INIT.
+ bigWrites bool
+
+ // dontMask is true if the filesystem does not apply umask to creation modes.
+ // Negotiated in INIT.
+ dontMask bool
+}
+
+// newFUSEConnection creates a FUSE connection to fd.
+func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRequests uint64) (*connection, error) {
+ // Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to
+ // mount a FUSE filesystem.
+ fuseFD := fd.Impl().(*DeviceFD)
+ fuseFD.mounted = true
+
+ // Create the writeBuf for the header to be stored in.
+ hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
+ fuseFD.writeBuf = make([]byte, hdrLen)
+ fuseFD.completions = make(map[linux.FUSEOpID]*futureResponse)
+ fuseFD.fullQueueCh = make(chan struct{}, maxInFlightRequests)
+ fuseFD.writeCursor = 0
+
+ return &connection{
+ fd: fuseFD,
+ maxBackground: fuseDefaultMaxBackground,
+ congestionThreshold: fuseDefaultCongestionThreshold,
+ maxPages: fuseDefaultMaxPagesPerReq,
+ initializedChan: make(chan struct{}),
+ connected: true,
+ }, nil
+}
+
+// SetInitialized atomically sets the connection as initialized.
+func (conn *connection) SetInitialized() {
+ // Unblock the requests sent before INIT.
+ close(conn.initializedChan)
+
+ // Close the channel first to avoid the non-atomic situation
+ // where conn.initialized is true but there are
+ // tasks being blocked on the channel.
+ // And it prevents the newer tasks from gaining
+ // unnecessary higher chance to be issued before the blocked one.
+
+ atomic.StoreInt32(&(conn.initialized), int32(1))
+}
+
+// Initialized atomically checks if the connection is initialized. Pairs with
+// SetInitialized().
+func (conn *connection) Initialized() bool {
+ return atomic.LoadInt32(&(conn.initialized)) != 0
+}
+
+// NewRequest creates a new request that can be sent to the FUSE server.
+func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) {
+ conn.fd.mu.Lock()
+ defer conn.fd.mu.Unlock()
+ conn.fd.nextOpID += linux.FUSEOpID(reqIDStep)
+
+ hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes()
+ hdr := linux.FUSEHeaderIn{
+ Len: uint32(hdrLen + payload.SizeBytes()),
+ Opcode: opcode,
+ Unique: conn.fd.nextOpID,
+ NodeID: ino,
+ UID: uint32(creds.EffectiveKUID),
+ GID: uint32(creds.EffectiveKGID),
+ PID: pid,
+ }
+
+ buf := make([]byte, hdr.Len)
+ hdr.MarshalUnsafe(buf[:hdrLen])
+ payload.MarshalUnsafe(buf[hdrLen:])
+
+ return &Request{
+ id: hdr.Unique,
+ hdr: &hdr,
+ data: buf,
+ }, nil
+}
+
+// Call makes a request to the server and blocks the invoking task until the
+// server responds. Task should never be nil.
+// Requests will not be sent before the connection is initialized.
+// For async tasks, use CallAsync().
+func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) {
+ // Block requests sent before the connection is initialized.
+ if !conn.Initialized() {
+ if err := t.Block(conn.initializedChan); err != nil {
+ return nil, err
+ }
+ }
+
+ return conn.call(t, r)
+}
+
+// CallAsync makes an async (aka background) request.
+// Such requests either do not expect a response (e.g. release) or have their
+// response handled by someone else (e.g. init).
+// It returns immediately unless the connection is blocked (before
+// initialization). Examples of async calls: init, release, forget, aio,
+// interrupt. A FUSE_INIT request is never blocked on initialization.
+func (conn *connection) CallAsync(t *kernel.Task, r *Request) error {
+ // Block requests sent before the connection is initialized.
+ if !conn.Initialized() && r.hdr.Opcode != linux.FUSE_INIT {
+ if err := t.Block(conn.initializedChan); err != nil {
+ return err
+ }
+ }
+
+ // This should be the only place that invokes call() with a nil task.
+ _, err := conn.call(nil, r)
+ return err
+}
+
+// call makes a call without blocking checks.
+func (conn *connection) call(t *kernel.Task, r *Request) (*Response, error) {
+ if !conn.connected {
+ return nil, syserror.ENOTCONN
+ }
+
+ if conn.connInitError {
+ return nil, syserror.ECONNREFUSED
+ }
+
+ fut, err := conn.callFuture(t, r)
+ if err != nil {
+ return nil, err
+ }
+
+ return fut.resolve(t)
+}
+
+// Error returns the error of the FUSE call.
+func (r *Response) Error() error {
+ errno := r.hdr.Error
+ if errno >= 0 {
+ return nil
+ }
+
+ sysErrNo := syscall.Errno(-errno)
+ return error(sysErrNo)
+}
+
+// UnmarshalPayload unmarshals the response data into m.
+func (r *Response) UnmarshalPayload(m marshal.Marshallable) error {
+ hdrLen := r.hdr.SizeBytes()
+ haveDataLen := r.hdr.Len - uint32(hdrLen)
+ wantDataLen := uint32(m.SizeBytes())
+
+ if haveDataLen < wantDataLen {
+ return fmt.Errorf("payload too small. Minimum data lenth required: %d, but got data length %d", wantDataLen, haveDataLen)
+ }
+
+ m.UnmarshalUnsafe(r.data[hdrLen:])
+ return nil
+}
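Editor's note: UnmarshalPayload trusts hdr.Len as the total frame size, so the available payload is hdr.Len minus the out-header size, and decoding must fail when the server under-delivers. A stdlib-only re-creation of that bounds check (a 16-byte out-header and little-endian layout are assumptions):

package main

import (
	"encoding/binary"
	"fmt"
)

const hdrLen = 16 // sketch: out-header is Len(4) + Error(4) + Unique(8)

// unmarshalPayload decodes want bytes of payload after the header,
// failing when the server sent fewer payload bytes than required.
func unmarshalPayload(msg []byte, want int) ([]byte, error) {
	total := binary.LittleEndian.Uint32(msg[:4]) // hdr.Len covers everything
	have := int(total) - hdrLen
	if have < want {
		return nil, fmt.Errorf("payload too small: need %d, got %d", want, have)
	}
	return msg[hdrLen : hdrLen+want], nil
}

func main() {
	msg := make([]byte, 20)
	binary.LittleEndian.PutUint32(msg[:4], 20) // Len = header + 4-byte payload
	binary.LittleEndian.PutUint32(msg[16:], 0xabcd)
	p, err := unmarshalPayload(msg, 4)
	fmt.Println(p, err)
	_, err = unmarshalPayload(msg, 8) // asks for more than was sent
	fmt.Println(err)
}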
+
+// callFuture makes a request to the server and returns a future response.
+// Call resolve() when the response needs to be fulfilled.
+func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, error) {
+ conn.fd.mu.Lock()
+ defer conn.fd.mu.Unlock()
+
+ // Is the queue full?
+ //
+ // We must busy wait here until the request can be queued. We don't
+ // block on the fd.fullQueueCh with a lock - so after being signalled,
+ // before we acquire the lock, it is possible that a barging task enters
+ // and queues a request. As a result, upon acquiring the lock we must
+ // again check if the room is available.
+ //
+ // This can potentially starve a request forever but this can only happen
+ // if there are always too many ongoing requests all the time. The
+ // supported maxActiveRequests setting should be really high to avoid this.
+ for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests {
+ if t == nil {
+ // Since there is no task waiting, we must error out.
+ return nil, errors.New("FUSE request queue full")
+ }
+
+ log.Infof("Blocking request %v from being queued. Too many active requests: %v",
+ r.id, conn.fd.numActiveRequests)
+ conn.fd.mu.Unlock()
+ err := t.Block(conn.fd.fullQueueCh)
+ conn.fd.mu.Lock()
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ return conn.callFutureLocked(t, r)
+}
+
+// callFutureLocked makes a request to the server and returns a future response.
+func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureResponse, error) {
+ conn.fd.queue.PushBack(r)
+ conn.fd.numActiveRequests += 1
+ fut := newFutureResponse(r.hdr.Opcode)
+ conn.fd.completions[r.id] = fut
+
+ // Signal the readers that there is something to read.
+ conn.fd.waitQueue.Notify(waiter.EventIn)
+
+ return fut, nil
+}
+
+// futureResponse represents an in-flight request, that may or may not have
+// completed yet. Convert it to a resolved Response by calling Resolve, but note
+// that this may block.
+//
+// +stateify savable
+type futureResponse struct {
+ opcode linux.FUSEOpcode
+ ch chan struct{}
+ hdr *linux.FUSEHeaderOut
+ data []byte
+}
+
+// newFutureResponse creates a future response to a FUSE request.
+func newFutureResponse(opcode linux.FUSEOpcode) *futureResponse {
+ return &futureResponse{
+ opcode: opcode,
+ ch: make(chan struct{}),
+ }
+}
+
+// resolve blocks the task until the server responds to its corresponding request,
+// then returns a resolved response.
+func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) {
+ // If there is no Task associated with this request, then we don't try to
+ // resolve the response. Instead, the task writing the response (proxy to
+ // the server) will process the response on our behalf.
+ if t == nil {
+ log.Infof("fuse.Response.resolve: Not waiting on a response from server.")
+ return nil, nil
+ }
+
+ if err := t.Block(f.ch); err != nil {
+ return nil, err
+ }
+
+ return f.getResponse(), nil
+}
+
+// getResponse creates a Response from the data the futureResponse has.
+func (f *futureResponse) getResponse() *Response {
+ return &Response{
+ opcode: f.opcode,
+ hdr: *f.hdr,
+ data: f.data,
+ }
+}
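Editor's note: the busy-wait in callFuture is the standard drop-the-lock-then-recheck idiom: after waking from fullQueueCh, the capacity test must be repeated under the mutex, because another task can barge in and take the freed slot first. A runnable stdlib analogue of the same loop:

package main

import (
	"fmt"
	"sync"
)

type queue struct {
	mu      sync.Mutex
	active  int
	max     int
	slotsCh chan struct{} // signalled when a slot frees up
}

// enqueue blocks until there is room, rechecking after every wakeup
// because a barging goroutine may have taken the slot first.
func (q *queue) enqueue() {
	q.mu.Lock()
	for q.active == q.max {
		q.mu.Unlock()
		<-q.slotsCh // wait for a completion signal, without the lock
		q.mu.Lock() // reacquire, then re-test the condition
	}
	q.active++
	q.mu.Unlock()
}

func (q *queue) complete() {
	q.mu.Lock()
	q.active--
	q.mu.Unlock()
	select { // non-blocking, like fd.fullQueueCh <- struct{}{}
	case q.slotsCh <- struct{}{}:
	default:
	}
}

func main() {
	q := &queue{max: 1, slotsCh: make(chan struct{}, 1)}
	q.enqueue()
	var wg sync.WaitGroup
	wg.Add(1)
	go func() { defer wg.Done(); q.enqueue() }() // blocks until a slot frees
	q.complete()
	wg.Wait()
	fmt.Println("active:", q.active) // 1
}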
diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go
new file mode 100644
index 000000000..e522ff9a0
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/dev.go
@@ -0,0 +1,397 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+const fuseDevMinor = 229
+
+// fuseDevice implements vfs.Device for /dev/fuse.
+type fuseDevice struct{}
+
+// Open implements vfs.Device.Open.
+func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ if !kernel.FUSEEnabled {
+ return nil, syserror.ENOENT
+ }
+
+ var fd DeviceFD
+ if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{
+ UseDentryMetadata: true,
+ }); err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
+
+// DeviceFD implements vfs.FileDescriptionImpl for /dev/fuse.
+type DeviceFD struct {
+ vfsfd vfs.FileDescription
+ vfs.FileDescriptionDefaultImpl
+ vfs.DentryMetadataFileDescriptionImpl
+ vfs.NoLockFD
+
+ // mounted specifies whether a FUSE filesystem was mounted using the DeviceFD.
+ mounted bool
+
+ // nextOpID is used to create new requests.
+ nextOpID linux.FUSEOpID
+
+ // queue is the list of requests that need to be processed by the FUSE server.
+ queue requestList
+
+ // numActiveRequests is the number of requests made by the Sentry that have
+ // yet to be responded to.
+ numActiveRequests uint64
+
+ // completions is used to map a request to its response. A Writer will use this
+ // to notify the caller of a completed response.
+ completions map[linux.FUSEOpID]*futureResponse
+
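+ // writeCursor tracks how many bytes of the current out-header (in writeBuf)
+ // or payload (in writeCursorFR.data) have been copied in so far.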
+ writeCursor uint32
+
+ // writeBuf is the memory buffer used to copy in the FUSE out header from
+ // userspace.
+ writeBuf []byte
+
+ // writeCursorFR is the current futureResponse being copied from the server.
+ writeCursorFR *futureResponse
+
+ // mu protects all the queues, maps, buffers and cursors and nextOpID.
+ mu sync.Mutex
+
+ // waitQueue is used to notify interested parties when the device becomes
+ // readable or writable.
+ waitQueue waiter.Queue
+
+ // fullQueueCh is a channel used to synchronize the readers with the writers.
+ // Writers (inbound requests to the filesystem) block if there are too many
+ // unprocessed in-flight requests.
+ fullQueueCh chan struct{}
+
+ // fs is the FUSE filesystem that this FD is being used for.
+ fs *filesystem
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *DeviceFD) Release(context.Context) {
+ fd.fs.conn.connected = false
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
+ if !fd.mounted {
+ return 0, syserror.EPERM
+ }
+
+ return 0, syserror.ENOSYS
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
+ if !fd.mounted {
+ return 0, syserror.EPERM
+ }
+
+ // We require that any Read done on this filesystem have a sane minimum
+ // read buffer. It must have the capacity for the fixed parts of any request
+ // header (Linux uses the request header and the FUSEWriteIn header for this
+ // calculation) + the negotiated MaxWrite room for the data.
+ minBuffSize := linux.FUSE_MIN_READ_BUFFER
+ inHdrLen := uint32((*linux.FUSEHeaderIn)(nil).SizeBytes())
+ writeHdrLen := uint32((*linux.FUSEWriteIn)(nil).SizeBytes())
+ negotiatedMinBuffSize := inHdrLen + writeHdrLen + fd.fs.conn.maxWrite
+ if minBuffSize < negotiatedMinBuffSize {
+ minBuffSize = negotiatedMinBuffSize
+ }
+
+ // If the read buffer is too small, error out.
+ if dst.NumBytes() < int64(minBuffSize) {
+ return 0, syserror.EINVAL
+ }
+
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ return fd.readLocked(ctx, dst, opts)
+}
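Editor's note: the sizing rule above takes whichever is larger, the fixed FUSE_MIN_READ_BUFFER floor or the negotiated in-header + write-header + maxWrite. A worked sketch (the 8192-byte floor and 40-byte header sizes are assumptions matching Linux's layouts):

package main

import "fmt"

const (
	fuseMinReadBuffer = 8192 // linux.FUSE_MIN_READ_BUFFER
	inHdrLen          = 40   // sizeof(fuse_in_header), an assumption
	writeHdrLen       = 40   // sizeof(fuse_write_in), an assumption
)

// minReadBufSize mirrors DeviceFD.Read's sizing rule.
func minReadBufSize(maxWrite uint32) uint32 {
	negotiated := uint32(inHdrLen+writeHdrLen) + maxWrite
	if negotiated > fuseMinReadBuffer {
		return negotiated
	}
	return fuseMinReadBuffer
}

func main() {
	fmt.Println(minReadBufSize(4096))  // 8192: the floor wins
	fmt.Println(minReadBufSize(65536)) // 65616: the negotiated size wins
}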
+
+// readLocked implements the reading of the fuse device while locked with DeviceFD.mu.
+func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ if fd.queue.Empty() {
+ return 0, syserror.ErrWouldBlock
+ }
+
+ var readCursor uint32
+ var bytesRead int64
+ for {
+ req := fd.queue.Front()
+ if dst.NumBytes() < int64(req.hdr.Len) {
+ // The request is too large. Cannot process it. All requests must be smaller than the
+ // negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT
+ // handshake.
+ errno := -int32(syscall.EIO)
+ if req.hdr.Opcode == linux.FUSE_SETXATTR {
+ errno = -int32(syscall.E2BIG)
+ }
+
+ // Return the error to the calling task.
+ if err := fd.sendError(ctx, errno, req); err != nil {
+ return 0, err
+ }
+
+ // We're done with this request.
+ fd.queue.Remove(req)
+
+ // Restart the read as this request was invalid.
+ log.Warningf("fuse.DeviceFD.Read: request found was too large. Restarting read.")
+ return fd.readLocked(ctx, dst, opts)
+ }
+
+ n, err := dst.CopyOut(ctx, req.data[readCursor:])
+ if err != nil {
+ return 0, err
+ }
+ readCursor += uint32(n)
+ bytesRead += int64(n)
+
+ if readCursor >= req.hdr.Len {
+ // Fully done with this req, remove it from the queue.
+ fd.queue.Remove(req)
+ break
+ }
+ }
+
+ return bytesRead, nil
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
+ if !fd.mounted {
+ return 0, syserror.EPERM
+ }
+
+ return 0, syserror.ENOSYS
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ return fd.writeLocked(ctx, src, opts)
+}
+
+// writeLocked implements writing to the fuse device while locked with DeviceFD.mu.
+func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
+ if !fd.mounted {
+ return 0, syserror.EPERM
+ }
+
+ var cn, n int64
+ hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
+
+ for src.NumBytes() > 0 {
+ if fd.writeCursorFR != nil {
+ // Already have common header, and we're now copying the payload.
+ wantBytes := fd.writeCursorFR.hdr.Len
+
+ // Note that the FR data doesn't include the header. Allocate it if it
+ // hasn't been already.
+ if fd.writeCursorFR.data == nil {
+ fd.writeCursorFR.data = make([]byte, wantBytes)
+ }
+
+ bytesCopied, err := src.CopyIn(ctx, fd.writeCursorFR.data[fd.writeCursor:wantBytes])
+ if err != nil {
+ return 0, err
+ }
+ src = src.DropFirst(bytesCopied)
+
+ cn = int64(bytesCopied)
+ n += cn
+ fd.writeCursor += uint32(cn)
+ if fd.writeCursor == wantBytes {
+ // Done reading this full response. Clean up and unblock the
+ // initiator.
+ break
+ }
+
+ // Check if we have more data in src.
+ continue
+ }
+
+ // Assert that the header isn't read into the writeBuf yet.
+ if fd.writeCursor >= hdrLen {
+ return 0, syserror.EINVAL
+ }
+
+ // We don't have the full common response header yet.
+ wantBytes := hdrLen - fd.writeCursor
+ bytesCopied, err := src.CopyIn(ctx, fd.writeBuf[fd.writeCursor:wantBytes])
+ if err != nil {
+ return 0, err
+ }
+ src = src.DropFirst(bytesCopied)
+
+ cn = int64(bytesCopied)
+ n += cn
+ fd.writeCursor += uint32(cn)
+ if fd.writeCursor == hdrLen {
+ // Have full header in the writeBuf. Use it to fetch the actual futureResponse
+ // from the device's completions map.
+ var hdr linux.FUSEHeaderOut
+ hdr.UnmarshalBytes(fd.writeBuf)
+
+ // We have the header now and so the writeBuf has served its purpose.
+ // We could reset it manually here, but instead the writeCursor is set
+ // to 0 at the end of the write, allowing the next write to overwrite
+ // what's in the buffer.
+
+ fut, ok := fd.completions[hdr.Unique]
+ if !ok {
+ // Server sent us a response for a request we never sent?
+ return 0, syserror.EINVAL
+ }
+
+ delete(fd.completions, hdr.Unique)
+
+ // Copy over the header into the future response. The rest of the payload
+ // will be copied over to the FR's data in the next iteration.
+ fut.hdr = &hdr
+ fd.writeCursorFR = fut
+
+ // Next iteration will now try to read the complete request, if src has
+ // any data remaining. Otherwise we're done.
+ }
+ }
+
+ if fd.writeCursorFR != nil {
+ if err := fd.sendResponse(ctx, fd.writeCursorFR); err != nil {
+ return 0, err
+ }
+
+ // Ready the device for the next request.
+ fd.writeCursorFR = nil
+ fd.writeCursor = 0
+ }
+
+ return n, nil
+}
+
+// Readiness implements vfs.FileDescriptionImpl.Readiness.
+func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+ var ready waiter.EventMask
+ ready |= waiter.EventOut // FD is always writable
+ if !fd.queue.Empty() {
+ // Have reqs available, FD is readable.
+ ready |= waiter.EventIn
+ }
+
+ return ready & mask
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (fd *DeviceFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ fd.waitQueue.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (fd *DeviceFD) EventUnregister(e *waiter.Entry) {
+ fd.waitQueue.EventUnregister(e)
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
+ if !fd.mounted {
+ return 0, syserror.EPERM
+ }
+
+ return 0, syserror.ENOSYS
+}
+
+// sendResponse sends a response to the waiting task (if any).
+func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error {
+ // See if the running task needs to perform some action before returning.
+ // Since we just finished writing the future, we can be sure that
+ // getResponse generates a populated response.
+ if err := fd.noReceiverAction(ctx, fut.getResponse()); err != nil {
+ return err
+ }
+
+ // Signal that the queue is no longer full.
+ select {
+ case fd.fullQueueCh <- struct{}{}:
+ default:
+ }
+ fd.numActiveRequests -= 1
+
+ // Signal the task waiting on a response.
+ close(fut.ch)
+ return nil
+}
+
+// sendError sends an error response to the waiting task (if any).
+func (fd *DeviceFD) sendError(ctx context.Context, errno int32, req *Request) error {
+ // Return the error to the calling task.
+ outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
+ respHdr := linux.FUSEHeaderOut{
+ Len: outHdrLen,
+ Error: errno,
+ Unique: req.hdr.Unique,
+ }
+
+ fut, ok := fd.completions[respHdr.Unique]
+ if !ok {
+ // Server sent us a response for a request we never sent?
+ return syserror.EINVAL
+ }
+ delete(fd.completions, respHdr.Unique)
+
+ fut.hdr = &respHdr
+ if err := fd.sendResponse(ctx, fut); err != nil {
+ return err
+ }
+
+ return nil
+}
+
+// noReceiverAction has the calling kernel.Task do some action if it's known that no
+// receiver is going to be waiting on the future channel. This is to be used by:
+// FUSE_INIT.
+func (fd *DeviceFD) noReceiverAction(ctx context.Context, r *Response) error {
+ if r.opcode == linux.FUSE_INIT {
+ creds := auth.CredentialsFromContext(ctx)
+ rootUserNs := kernel.KernelFromContext(ctx).RootUserNamespace()
+ return fd.fs.conn.InitRecv(r, creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, rootUserNs))
+ }
+
+ return nil
+}
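Editor's note: seen from the server, writeLocked tolerates a reply arriving in pieces: the 16-byte FUSEHeaderOut is collected first (into writeBuf), the matching futureResponse is looked up by hdr.Unique, and the remaining hdr.Len - 16 bytes fill its payload. A standalone sketch of framing one well-formed reply that this parser would accept (16-byte header assumed):

package main

import (
	"encoding/binary"
	"fmt"
)

const outHdrLen = 16 // Len(4) + Error(4) + Unique(8)

// buildResponse frames a FUSE-style reply: common header, then payload.
func buildResponse(unique uint64, payload []byte) []byte {
	buf := make([]byte, outHdrLen+len(payload))
	binary.LittleEndian.PutUint32(buf[0:4], uint32(len(buf))) // hdr.Len
	binary.LittleEndian.PutUint32(buf[4:8], 0)                // hdr.Error = 0
	binary.LittleEndian.PutUint64(buf[8:16], unique)          // echoes the request ID
	copy(buf[outHdrLen:], payload)
	return buf
}

func main() {
	resp := buildResponse(42, []byte{0xde, 0xad})
	// The device accepts this in one write or split anywhere, e.g. after
	// the header: write(resp[:16]) then write(resp[16:]).
	fmt.Println(len(resp), resp[:4]) // 18 [18 0 0 0]
}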
diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go
new file mode 100644
index 000000000..1ffe7ccd2
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/dev_test.go
@@ -0,0 +1,428 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "fmt"
+ "io"
+ "math/rand"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+ "gvisor.dev/gvisor/tools/go_marshal/marshal"
+)
+
+// echoTestOpcode is the Opcode used during testing. The server used in tests
+// will simply echo the payload back with the appropriate headers.
+const echoTestOpcode linux.FUSEOpcode = 1000
+
+type testPayload struct {
+ data uint32
+}
+
+// TestFUSECommunication tests that the communication layer between the Sentry and the
+// FUSE server daemon works as expected.
+func TestFUSECommunication(t *testing.T) {
+ s := setup(t)
+ defer s.Destroy()
+
+ k := kernel.KernelFromContext(s.Ctx)
+ creds := auth.CredentialsFromContext(s.Ctx)
+
+ // Create test cases with different number of concurrent clients and servers.
+ testCases := []struct {
+ Name string
+ NumClients int
+ NumServers int
+ MaxActiveRequests uint64
+ }{
+ {
+ Name: "SingleClientSingleServer",
+ NumClients: 1,
+ NumServers: 1,
+ MaxActiveRequests: maxActiveRequestsDefault,
+ },
+ {
+ Name: "SingleClientMultipleServers",
+ NumClients: 1,
+ NumServers: 10,
+ MaxActiveRequests: maxActiveRequestsDefault,
+ },
+ {
+ Name: "MultipleClientsSingleServer",
+ NumClients: 10,
+ NumServers: 1,
+ MaxActiveRequests: maxActiveRequestsDefault,
+ },
+ {
+ Name: "MultipleClientsMultipleServers",
+ NumClients: 10,
+ NumServers: 10,
+ MaxActiveRequests: maxActiveRequestsDefault,
+ },
+ {
+ Name: "RequestCapacityFull",
+ NumClients: 10,
+ NumServers: 1,
+ MaxActiveRequests: 1,
+ },
+ {
+ Name: "RequestCapacityContinuouslyFull",
+ NumClients: 100,
+ NumServers: 2,
+ MaxActiveRequests: 2,
+ },
+ }
+
+ for _, testCase := range testCases {
+ t.Run(testCase.Name, func(t *testing.T) {
+ conn, fd, err := newTestConnection(s, k, testCase.MaxActiveRequests)
+ if err != nil {
+ t.Fatalf("newTestConnection: %v", err)
+ }
+
+ clientsDone := make([]chan struct{}, testCase.NumClients)
+ serversDone := make([]chan struct{}, testCase.NumServers)
+ serversKill := make([]chan struct{}, testCase.NumServers)
+
+ // FUSE clients.
+ for i := 0; i < testCase.NumClients; i++ {
+ clientsDone[i] = make(chan struct{})
+ go func(i int) {
+ fuseClientRun(t, s, k, conn, creds, uint32(i), uint64(i), clientsDone[i])
+ }(i)
+ }
+
+ // FUSE servers.
+ for j := 0; j < testCase.NumServers; j++ {
+ serversDone[j] = make(chan struct{})
+ serversKill[j] = make(chan struct{}, 1) // The kill command shouldn't block.
+ go func(j int) {
+ fuseServerRun(t, s, k, fd, serversDone[j], serversKill[j])
+ }(j)
+ }
+
+ // Tear down.
+ //
+ // Make sure all the clients are done.
+ for i := 0; i < testCase.NumClients; i++ {
+ <-clientsDone[i]
+ }
+
+ // Kill any server that is potentially waiting.
+ for j := 0; j < testCase.NumServers; j++ {
+ serversKill[j] <- struct{}{}
+ }
+
+ // Make sure all the servers are done.
+ for j := 0; j < testCase.NumServers; j++ {
+ <-serversDone[j]
+ }
+ })
+ }
+}
+
+// CallTest makes a request to the server and blocks the invoking goroutine
+// until the server responds. It doesn't block a kernel.Task. Analogous to
+// connection.Call but used for testing.
+func CallTest(conn *connection, t *kernel.Task, r *Request, i uint32) (*Response, error) {
+ conn.fd.mu.Lock()
+
+ // Wait until we're certain that a new request can be processed.
+ for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests {
+ conn.fd.mu.Unlock()
+ select {
+ case <-conn.fd.fullQueueCh:
+ }
+ conn.fd.mu.Lock()
+ }
+
+ fut, err := conn.callFutureLocked(t, r) // No task given.
+ conn.fd.mu.Unlock()
+
+ if err != nil {
+ return nil, err
+ }
+
+ // Resolve the response.
+ //
+ // Block without a task.
+ select {
+ case <-fut.ch:
+ }
+
+ // A response is ready. Resolve and return it.
+ return fut.getResponse(), nil
+}
+
+// ReadTest is analogous to vfs.FileDescription.Read and reads from the FUSE
+// device. However, it does so without blocking the calling task; instead it
+// just waits on a channel. The behaviour is essentially the same as
+// DeviceFD.Read except it guarantees that the task is not blocked.
+func ReadTest(serverTask *kernel.Task, fd *vfs.FileDescription, inIOseq usermem.IOSequence, killServer chan struct{}) (int64, bool, error) {
+ var err error
+ var n, total int64
+
+ dev := fd.Impl().(*DeviceFD)
+
+ // Register for notifications.
+ w, ch := waiter.NewChannelEntry(nil)
+ dev.EventRegister(&w, waiter.EventIn)
+ for {
+ // Issue the request and break out if it completes with anything other than
+ // "would block".
+ n, err = dev.Read(serverTask, inIOseq, vfs.ReadOptions{})
+ total += n
+ if err != syserror.ErrWouldBlock {
+ break
+ }
+
+ // Wait for a notification that we should retry.
+ // Emulate blocking for when no requests are available.
+ select {
+ case <-ch:
+ case <-killServer:
+ // Server killed by the main program.
+ return 0, true, nil
+ }
+ }
+
+ dev.EventUnregister(&w)
+ return total, false, err
+}
+
+// fuseClientRun emulates all the actions of a normal FUSE request. It creates
+// a header, a payload, calls the server, waits for the response, and processes
+// the response.
+func fuseClientRun(t *testing.T, s *testutil.System, k *kernel.Kernel, conn *connection, creds *auth.Credentials, pid uint32, inode uint64, clientDone chan struct{}) {
+ defer func() { clientDone <- struct{}{} }()
+
+ tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+ clientTask, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("fuse-client-%v", pid), tc, s.MntNs, s.Root, s.Root)
+ if err != nil {
+ t.Fatal(err)
+ }
+ testObj := &testPayload{
+ data: rand.Uint32(),
+ }
+
+ req, err := conn.NewRequest(creds, pid, inode, echoTestOpcode, testObj)
+ if err != nil {
+ t.Fatalf("NewRequest creation failed: %v", err)
+ }
+
+ // Queue up a request.
+ // Analogous to Call except it doesn't block on the task.
+ resp, err := CallTest(conn, clientTask, req, pid)
+ if err != nil {
+ t.Fatalf("CallTaskNonBlock failed: %v", err)
+ }
+
+ if err = resp.Error(); err != nil {
+ t.Fatalf("Server responded with an error: %v", err)
+ }
+
+ var respTestPayload testPayload
+ if err := resp.UnmarshalPayload(&respTestPayload); err != nil {
+ t.Fatalf("Unmarshalling payload error: %v", err)
+ }
+
+ if resp.hdr.Unique != req.hdr.Unique {
+ t.Fatalf("got response for another request. Expected response for req %v but got response for req %v",
+ req.hdr.Unique, resp.hdr.Unique)
+ }
+
+ if respTestPayload.data != testObj.data {
+ t.Fatalf("read incorrect data. Data expected: %v, but got %v", testObj.data, respTestPayload.data)
+ }
+}
+
+// fuseServerRun creates a task and emulates all the actions of a simple FUSE server
+// that simply reads a request and echoes the same struct back as a response using the
+// appropriate headers.
+func fuseServerRun(t *testing.T, s *testutil.System, k *kernel.Kernel, fd *vfs.FileDescription, serverDone, killServer chan struct{}) {
+ defer func() { serverDone <- struct{}{} }()
+
+ // Create the tasks that the server will be using.
+ tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+ var readPayload testPayload
+
+ serverTask, err := testutil.CreateTask(s.Ctx, "fuse-server", tc, s.MntNs, s.Root, s.Root)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ // Read the request.
+ for {
+ inHdrLen := uint32((*linux.FUSEHeaderIn)(nil).SizeBytes())
+ payloadLen := uint32(readPayload.SizeBytes())
+
+ // The read buffer must meet certain size criteria.
+ buffSize := inHdrLen + payloadLen
+ if buffSize < linux.FUSE_MIN_READ_BUFFER {
+ buffSize = linux.FUSE_MIN_READ_BUFFER
+ }
+ inBuf := make([]byte, buffSize)
+ inIOseq := usermem.BytesIOSequence(inBuf)
+
+ n, serverKilled, err := ReadTest(serverTask, fd, inIOseq, killServer)
+ if err != nil {
+ t.Fatalf("Read failed :%v", err)
+ }
+
+ // Server should shut down. No new requests are going to be made.
+ if serverKilled {
+ break
+ }
+
+ if n <= 0 {
+ t.Fatalf("Read read no bytes")
+ }
+
+ var readFUSEHeaderIn linux.FUSEHeaderIn
+ readFUSEHeaderIn.UnmarshalUnsafe(inBuf[:inHdrLen])
+ readPayload.UnmarshalUnsafe(inBuf[inHdrLen : inHdrLen+payloadLen])
+
+ if readFUSEHeaderIn.Opcode != echoTestOpcode {
+ t.Fatalf("read incorrect data. Header: %v, Payload: %v", readFUSEHeaderIn, readPayload)
+ }
+
+ // Write the response.
+ outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
+ outBuf := make([]byte, outHdrLen+payloadLen)
+ outHeader := linux.FUSEHeaderOut{
+ Len: outHdrLen + payloadLen,
+ Error: 0,
+ Unique: readFUSEHeaderIn.Unique,
+ }
+
+ // Echo the payload back.
+ outHeader.MarshalUnsafe(outBuf[:outHdrLen])
+ readPayload.MarshalUnsafe(outBuf[outHdrLen:])
+ outIOseq := usermem.BytesIOSequence(outBuf)
+
+ n, err = fd.Write(s.Ctx, outIOseq, vfs.WriteOptions{})
+ if err != nil {
+ t.Fatalf("Write failed :%v", err)
+ }
+ }
+}
+
+func setup(t *testing.T) *testutil.System {
+ k, err := testutil.Boot()
+ if err != nil {
+ t.Fatalf("Error creating kernel: %v", err)
+ }
+
+ ctx := k.SupervisorContext()
+ creds := auth.CredentialsFromContext(ctx)
+
+ k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserList: true,
+ AllowUserMount: true,
+ })
+
+ mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+ if err != nil {
+ t.Fatalf("NewMountNamespace(): %v", err)
+ }
+
+ return testutil.NewSystem(ctx, t, k.VFS(), mntns)
+}
+
+// newTestConnection creates a fuse connection that the sentry can communicate with
+// and the FD for the server to communicate with.
+func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*connection, *vfs.FileDescription, error) {
+ vfsObj := &vfs.VirtualFilesystem{}
+ fuseDev := &DeviceFD{}
+
+ if err := vfsObj.Init(system.Ctx); err != nil {
+ return nil, nil, err
+ }
+
+ vd := vfsObj.NewAnonVirtualDentry("genCountFD")
+ defer vd.DecRef(system.Ctx)
+ if err := fuseDev.vfsfd.Init(fuseDev, linux.O_RDWR|linux.O_CREAT, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{}); err != nil {
+ return nil, nil, err
+ }
+
+ fsopts := filesystemOptions{
+ maxActiveRequests: maxActiveRequests,
+ }
+ fs, err := NewFUSEFilesystem(system.Ctx, 0, &fsopts, &fuseDev.vfsfd)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ return fs.conn, &fuseDev.vfsfd, nil
+}
+
+// SizeBytes implements marshal.Marshallable.SizeBytes.
+func (t *testPayload) SizeBytes() int {
+ return 4
+}
+
+// MarshalBytes implements marshal.Marshallable.MarshalBytes.
+func (t *testPayload) MarshalBytes(dst []byte) {
+ usermem.ByteOrder.PutUint32(dst[:4], t.data)
+}
+
+// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.
+func (t *testPayload) UnmarshalBytes(src []byte) {
+ *t = testPayload{data: usermem.ByteOrder.Uint32(src[:4])}
+}
+
+// Packed implements marshal.Marshallable.Packed.
+func (t *testPayload) Packed() bool {
+ return true
+}
+
+// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.
+func (t *testPayload) MarshalUnsafe(dst []byte) {
+ t.MarshalBytes(dst)
+}
+
+// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.
+func (t *testPayload) UnmarshalUnsafe(src []byte) {
+ t.UnmarshalBytes(src)
+}
+
+// CopyOutN implements marshal.Marshallable.CopyOutN.
+func (t *testPayload) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {
+ panic("not implemented")
+}
+
+// CopyOut implements marshal.Marshallable.CopyOut.
+func (t *testPayload) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {
+ panic("not implemented")
+}
+
+// CopyIn implements marshal.Marshallable.CopyIn.
+func (t *testPayload) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {
+ panic("not implemented")
+}
+
+// WriteTo implements io.WriterTo.WriteTo.
+func (t *testPayload) WriteTo(w io.Writer) (int64, error) {
+ panic("not implemented")
+}
diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go
new file mode 100644
index 000000000..83c24ec25
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/fusefs.go
@@ -0,0 +1,324 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fuse implements fusefs.
+package fuse
+
+import (
+ "strconv"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Name is the default filesystem name.
+const Name = "fuse"
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+type filesystemOptions struct {
+ // userID specifies the numeric uid of the mount owner.
+ // This option should not be specified by the filesystem owner.
+ // It is set by libfuse (or, if libfuse is not used, must be set
+ // by the filesystem itself). For more information, see the fuse(8)
+ // man page.
+ userID uint32
+
+ // groupID specifies the numeric gid of the mount owner.
+ // This option should not be specified by the filesystem owner.
+ // It is set by libfuse (or, if libfuse is not used, must be set
+ // by the filesystem itself). For more information, see the fuse(8)
+ // man page.
+ groupID uint32
+
+ // rootMode specifies the file mode of the filesystem's root.
+ rootMode linux.FileMode
+
+ // maxActiveRequests specifies the maximum number of active requests that can
+ // exist at any time. Any further requests will block when trying to
+ // Call the server.
+ maxActiveRequests uint64
+}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+ kernfs.Filesystem
+ devMinor uint32
+
+ // conn is used for communication between the FUSE server
+ // daemon and the sentry fusefs.
+ conn *connection
+
+ // opts is the options the fusefs is initialized with.
+ opts *filesystemOptions
+}
+
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+ return Name
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+ devMinor, err := vfsObj.GetAnonBlockDevMinor()
+ if err != nil {
+ return nil, nil, err
+ }
+
+ var fsopts filesystemOptions
+ mopts := vfs.GenericParseMountOptions(opts.Data)
+ deviceDescriptorStr, ok := mopts["fd"]
+ if !ok {
+ log.Warningf("%s.GetFilesystem: communication file descriptor N (obtained by opening /dev/fuse) must be specified as 'fd=N'", fsType.Name())
+ return nil, nil, syserror.EINVAL
+ }
+ delete(mopts, "fd")
+
+ deviceDescriptor, err := strconv.ParseInt(deviceDescriptorStr, 10 /* base */, 32 /* bitSize */)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ kernelTask := kernel.TaskFromContext(ctx)
+ if kernelTask == nil {
+ log.Warningf("%s.GetFilesystem: couldn't get kernel task from context", fsType.Name())
+ return nil, nil, syserror.EINVAL
+ }
+ fuseFd := kernelTask.GetFileVFS2(int32(deviceDescriptor))
+
+ // Parse and set all the other supported FUSE mount options.
+ // TODO(gVisor.dev/issue/3229): Expand the supported mount options.
+ if userIDStr, ok := mopts["user_id"]; ok {
+ delete(mopts, "user_id")
+ userID, err := strconv.ParseUint(userIDStr, 10, 32)
+ if err != nil {
+ log.Warningf("%s.GetFilesystem: invalid user_id: user_id=%s", fsType.Name(), userIDStr)
+ return nil, nil, syserror.EINVAL
+ }
+ fsopts.userID = uint32(userID)
+ }
+
+ if groupIDStr, ok := mopts["group_id"]; ok {
+ delete(mopts, "group_id")
+ groupID, err := strconv.ParseUint(groupIDStr, 10, 32)
+ if err != nil {
+ log.Warningf("%s.GetFilesystem: invalid group_id: group_id=%s", fsType.Name(), groupIDStr)
+ return nil, nil, syserror.EINVAL
+ }
+ fsopts.groupID = uint32(groupID)
+ }
+
+ rootMode := linux.FileMode(0777)
+ modeStr, ok := mopts["rootmode"]
+ if ok {
+ delete(mopts, "rootmode")
+ mode, err := strconv.ParseUint(modeStr, 8, 32)
+ if err != nil {
+ log.Warningf("%s.GetFilesystem: invalid mode: %q", fsType.Name(), modeStr)
+ return nil, nil, syserror.EINVAL
+ }
+ rootMode = linux.FileMode(mode)
+ }
+ fsopts.rootMode = rootMode
+
+ // Set the maxActiveRequests option.
+ fsopts.maxActiveRequests = maxActiveRequestsDefault
+
+ // Check for unparsed options.
+ if len(mopts) != 0 {
+ log.Warningf("%s.GetFilesystem: unknown options: %v", fsType.Name(), mopts)
+ return nil, nil, syserror.EINVAL
+ }
+
+ // Create a new FUSE filesystem.
+ fs, err := NewFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd)
+ if err != nil {
+ log.Warningf("%s.NewFUSEFilesystem: failed with error: %v", fsType.Name(), err)
+ return nil, nil, err
+ }
+
+ fs.VFSFilesystem().Init(vfsObj, &fsType, fs)
+
+ // Send a FUSE_INIT request to the FUSE daemon server before returning.
+ // This call is not blocking.
+ if err := fs.conn.InitSend(creds, uint32(kernelTask.ThreadID())); err != nil {
+ log.Warningf("%s.InitSend: failed with error: %v", fsType.Name(), err)
+ return nil, nil, err
+ }
+
+ // root is the fusefs root directory.
+ root := fs.newInode(creds, fsopts.rootMode)
+
+ return fs.VFSFilesystem(), root.VFSDentry(), nil
+}
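Editor's note: a successful mount therefore needs data of the form libfuse passes: a mandatory fd=N naming the opened /dev/fuse descriptor, plus optional octal rootmode and decimal user_id/group_id; any leftover option fails the mount with EINVAL. A simplified sketch of a parser accepting the same shape (values illustrative, well-formed key=value pairs assumed):

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseFuseData mimics the accepted option set: fd is required,
// rootmode is octal, user_id/group_id are decimal.
func parseFuseData(data string) (fd int64, rootmode uint64, err error) {
	opts := map[string]string{}
	for _, kv := range strings.Split(data, ",") {
		parts := strings.SplitN(kv, "=", 2)
		opts[parts[0]] = parts[1]
	}
	fdStr, ok := opts["fd"]
	if !ok {
		return 0, 0, fmt.Errorf("fd=N is mandatory")
	}
	fd, err = strconv.ParseInt(fdStr, 10, 32)
	if err != nil {
		return 0, 0, err
	}
	rootmode, err = strconv.ParseUint(opts["rootmode"], 8, 32) // octal, like the kernel
	return fd, rootmode, err
}

func main() {
	// The shape libfuse passes at mount time (values illustrative).
	fmt.Println(parseFuseData("fd=7,rootmode=40000,user_id=0,group_id=0"))
}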
+
+// NewFUSEFilesystem creates a new FUSE filesystem.
+func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) {
+ fs := &filesystem{
+ devMinor: devMinor,
+ opts: opts,
+ }
+
+ conn, err := newFUSEConnection(ctx, device, opts.maxActiveRequests)
+ if err != nil {
+ log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err)
+ return nil, syserror.EINVAL
+ }
+
+ fs.conn = conn
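+ // Attach this filesystem to the device FD so the device implementation can reach it.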
+ fuseFD := device.Impl().(*DeviceFD)
+ fuseFD.fs = fs
+
+ return fs, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release(ctx context.Context) {
+ fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+ fs.Filesystem.Release(ctx)
+}
+
+// inode implements kernfs.Inode.
+type inode struct {
+ kernfs.InodeAttrs
+ kernfs.InodeNoDynamicLookup
+ kernfs.InodeNotSymlink
+ kernfs.InodeDirectoryNoNewChildren
+ kernfs.OrderedChildren
+
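+ // locks is the file lock state for this inode.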
+ locks vfs.FileLocks
+
+ dentry kernfs.Dentry
+}
+
+func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry {
+ i := &inode{}
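+ // Note: the mode argument is currently unused; the inode is always initialized as a 0755 directory.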
+ i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
+ i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+ i.dentry.Init(i)
+
+ return &i.dentry
+}
+
+// Open implements kernfs.Inode.Open.
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+ if err != nil {
+ return nil, err
+ }
+ return fd.VFSFileDescription(), nil
+}
+
+// statFromFUSEAttr converts a linux.FUSEAttr to a linux.Statx. The
+// opts.Sync attribute is ignored since the synchronization is handled by the
+// FUSE server.
+func statFromFUSEAttr(attr linux.FUSEAttr, mask, devMinor uint32) linux.Statx {
+ var stat linux.Statx
+ stat.Blksize = attr.BlkSize
+ stat.DevMajor, stat.DevMinor = linux.UNNAMED_MAJOR, devMinor
+
+ rdevMajor, rdevMinor := linux.DecodeDeviceID(attr.Rdev)
+ stat.RdevMajor, stat.RdevMinor = uint32(rdevMajor), rdevMinor
+
+ if mask&linux.STATX_MODE != 0 {
+ stat.Mode = uint16(attr.Mode)
+ }
+ if mask&linux.STATX_NLINK != 0 {
+ stat.Nlink = attr.Nlink
+ }
+ if mask&linux.STATX_UID != 0 {
+ stat.UID = attr.UID
+ }
+ if mask&linux.STATX_GID != 0 {
+ stat.GID = attr.GID
+ }
+ if mask&linux.STATX_ATIME != 0 {
+ stat.Atime = linux.StatxTimestamp{
+ Sec: int64(attr.Atime),
+ Nsec: attr.AtimeNsec,
+ }
+ }
+ if mask&linux.STATX_MTIME != 0 {
+ stat.Mtime = linux.StatxTimestamp{
+ Sec: int64(attr.Mtime),
+ Nsec: attr.MtimeNsec,
+ }
+ }
+ if mask&linux.STATX_CTIME != 0 {
+ stat.Ctime = linux.StatxTimestamp{
+ Sec: int64(attr.Ctime),
+ Nsec: attr.CtimeNsec,
+ }
+ }
+ if mask&linux.STATX_INO != 0 {
+ stat.Ino = attr.Ino
+ }
+ if mask&linux.STATX_SIZE != 0 {
+ stat.Size = attr.Size
+ }
+ if mask&linux.STATX_BLOCKS != 0 {
+ stat.Blocks = attr.Blocks
+ }
+ return stat
+}
+
+// Stat implements kernfs.Inode.Stat.
+func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+ fusefs := fs.Impl().(*filesystem)
+ conn := fusefs.conn
+ task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
+ if task == nil {
+ log.Warningf("couldn't get kernel task from context")
+ return linux.Statx{}, syserror.EINVAL
+ }
+
+ var in linux.FUSEGetAttrIn
+ // We don't set any attribute in the request, because in VFS2 fstat(2) will
+ // ultimately be translated into vfs.FilesystemImpl.StatAt() (see
+ // pkg/sentry/syscalls/linux/vfs2/stat.go), resulting in the same flow
+ // as stat(2). Thus the GetAttrFlags and Fh fields are never used in VFS2.
+ req, err := conn.NewRequest(creds, uint32(task.ThreadID()), i.Ino(), linux.FUSE_GETATTR, &in)
+ if err != nil {
+ return linux.Statx{}, err
+ }
+
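+ // Send the FUSE_GETATTR request and wait for the server's reply.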
+ res, err := conn.Call(task, req)
+ if err != nil {
+ return linux.Statx{}, err
+ }
+ if err := res.Error(); err != nil {
+ return linux.Statx{}, err
+ }
+
+ var out linux.FUSEGetAttrOut
+ if err := res.UnmarshalPayload(&out); err != nil {
+ return linux.Statx{}, err
+ }
+
+ // Set all metadata into kernfs.InodeAttrs.
+ if err := i.SetStat(ctx, fs, creds, vfs.SetStatOptions{
+ Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, fusefs.devMinor),
+ }); err != nil {
+ return linux.Statx{}, err
+ }
+
+ return statFromFUSEAttr(out.Attr, opts.Mask, fusefs.devMinor), nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/init.go b/pkg/sentry/fsimpl/fuse/init.go
new file mode 100644
index 000000000..779c2bd3f
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/init.go
@@ -0,0 +1,166 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// Constants used during FUSE_INIT negotiation.
+const (
+ // fuseMaxMaxPages is the maximum value for MaxPages received in InitOut.
+ // Follows the same behavior as the Linux FUSE implementation.
+ fuseMaxMaxPages = 256
+
+ // Maximum value for the time granularity of file timestamps, 1s.
+ // Follows the same behavior as the Linux FUSE implementation.
+ fuseMaxTimeGranNs = 1000000000
+
+ // Minimum value for MaxWrite.
+ // Follows the same behavior as the Linux FUSE implementation.
+ fuseMinMaxWrite = 4096
+
+ // Temporary default value for max readahead, 128 KB.
+ fuseDefaultMaxReadahead = 131072
+
+ // The FUSE_INIT_IN flags sent to the daemon.
+ // TODO(gvisor.dev/issue/3199): complete the flags.
+ fuseDefaultInitFlags = linux.FUSE_MAX_PAGES
+)
+
+// Adjustable maximums for the connection's congestion control parameters.
+// Used as upper bounds for the configured values.
+// Adjusting them is not currently supported.
+var (
+ MaxUserBackgroundRequest uint16 = fuseDefaultMaxBackground
+ MaxUserCongestionThreshold uint16 = fuseDefaultCongestionThreshold
+)
+
+// InitSend sends a FUSE_INIT request.
+func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error {
+ in := linux.FUSEInitIn{
+ Major: linux.FUSE_KERNEL_VERSION,
+ Minor: linux.FUSE_KERNEL_MINOR_VERSION,
+ // TODO(gvisor.dev/issue/3196): find appropriate way to calculate this
+ MaxReadahead: fuseDefaultMaxReadahead,
+ Flags: fuseDefaultInitFlags,
+ }
+
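+ // FUSE_INIT is not associated with any inode, so pass 0 for the node ID.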
+ req, err := conn.NewRequest(creds, pid, 0, linux.FUSE_INIT, &in)
+ if err != nil {
+ return err
+ }
+
+ // Since FUSE_INIT is the request that unblocks all other requests, there
+ // is no task to block on, so pass nil.
+ return conn.CallAsync(nil, req)
+}
+
+// InitRecv receives a FUSE_INIT reply and processes it.
+func (conn *connection) InitRecv(res *Response, hasSysAdminCap bool) error {
+ if err := res.Error(); err != nil {
+ return err
+ }
+
+ var out linux.FUSEInitOut
+ if err := res.UnmarshalPayload(&out); err != nil {
+ return err
+ }
+
+ return conn.initProcessReply(&out, hasSysAdminCap)
+}
+
+// initProcessReply processes the FUSE_INIT reply from the FUSE server.
+func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap bool) error {
+ // No support for old major fuse versions.
+ if out.Major != linux.FUSE_KERNEL_VERSION {
+ conn.connInitError = true
+
+ // Set the connection as initialized and unblock the blocked requests
+ // (i.e. return error for them).
+ conn.SetInitialized()
+
+ return nil
+ }
+
+ // Start processing the reply.
+ conn.connInitSuccess = true
+ conn.minor = out.Minor
+
+ // No support for limits before minor version 13.
+ if out.Minor >= 13 {
+ conn.bgLock.Lock()
+
+ if out.MaxBackground > 0 {
+ conn.maxBackground = out.MaxBackground
+
+ if !hasSysAdminCap &&
+ conn.maxBackground > MaxUserBackgroundRequest {
+ conn.maxBackground = MaxUserBackgroundRequest
+ }
+ }
+
+ if out.CongestionThreshold > 0 {
+ conn.congestionThreshold = out.CongestionThreshold
+
+ if !hasSysAdminCap &&
+ conn.congestionThreshold > MaxUserCongestionThreshold {
+ conn.congestionThreshold = MaxUserCongestionThreshold
+ }
+ }
+
+ conn.bgLock.Unlock()
+ }
+
+ // No support for the following flags before minor version 6.
+ if out.Minor >= 6 {
+ conn.asyncRead = out.Flags&linux.FUSE_ASYNC_READ != 0
+ conn.bigWrites = out.Flags&linux.FUSE_BIG_WRITES != 0
+ conn.dontMask = out.Flags&linux.FUSE_DONT_MASK != 0
+ conn.writebackCache = out.Flags&linux.FUSE_WRITEBACK_CACHE != 0
+ conn.cacheSymlinks = out.Flags&linux.FUSE_CACHE_SYMLINKS != 0
+ conn.abortErr = out.Flags&linux.FUSE_ABORT_ERROR != 0
+
+ // TODO(gvisor.dev/issue/3195): figure out how to use TimeGran (0 < TimeGran <= fuseMaxTimeGranNs).
+
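+ // Clamp MaxPages to the range [1, fuseMaxMaxPages].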
+ if out.Flags&linux.FUSE_MAX_PAGES != 0 {
+ maxPages := out.MaxPages
+ if maxPages < 1 {
+ maxPages = 1
+ }
+ if maxPages > fuseMaxMaxPages {
+ maxPages = fuseMaxMaxPages
+ }
+ conn.maxPages = maxPages
+ }
+ }
+
+ // No support for negotiating MaxWrite before minor version 5.
+ if out.Minor >= 5 {
+ conn.maxWrite = out.MaxWrite
+ } else {
+ conn.maxWrite = fuseMinMaxWrite
+ }
+ if conn.maxWrite < fuseMinMaxWrite {
+ conn.maxWrite = fuseMinMaxWrite
+ }
+
+ // Set the connection as initialized and unblock the requests issued
+ // before init.
+ conn.SetInitialized()
+
+ return nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/register.go b/pkg/sentry/fsimpl/fuse/register.go
new file mode 100644
index 000000000..b5b581152
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/register.go
@@ -0,0 +1,42 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// Register registers the FUSE device with vfsObj.
+func Register(vfsObj *vfs.VirtualFilesystem) error {
+ if err := vfsObj.RegisterDevice(vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, fuseDevice{}, &vfs.RegisterDeviceOptions{
+ GroupName: "misc",
+ }); err != nil {
+ return err
+ }
+
+ return nil
+}
+
+// CreateDevtmpfsFile creates a device special file in devtmpfs.
+func CreateDevtmpfsFile(ctx context.Context, dev *devtmpfs.Accessor) error {
+ if err := dev.CreateDeviceFile(ctx, "fuse", vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, 0666 /* mode */); err != nil {
+ return err
+ }
+
+ return nil
+}
diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD
index 67e916525..16787116f 100644
--- a/pkg/sentry/fsimpl/gofer/BUILD
+++ b/pkg/sentry/fsimpl/gofer/BUILD
@@ -35,6 +35,7 @@ go_library(
"fstree.go",
"gofer.go",
"handle.go",
+ "host_named_pipe.go",
"p9file.go",
"regular_file.go",
"socket.go",
@@ -47,11 +48,13 @@ go_library(
"//pkg/abi/linux",
"//pkg/context",
"//pkg/fd",
+ "//pkg/fdnotifier",
"//pkg/fspath",
"//pkg/log",
"//pkg/p9",
"//pkg/safemem",
"//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/fsimpl/host",
"//pkg/sentry/hostfd",
"//pkg/sentry/kernel",
@@ -71,6 +74,7 @@ go_library(
"//pkg/unet",
"//pkg/usermem",
"//pkg/waiter",
+ "@org_golang_x_sys//unix:go_default_library",
],
)
@@ -81,5 +85,6 @@ go_test(
deps = [
"//pkg/p9",
"//pkg/sentry/contexttest",
+ "//pkg/sentry/pgalloc",
],
)
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index b98218753..2a8011eb4 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -85,14 +85,13 @@ func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) {
d2 := &dentry{
refs: 1, // held by d
fs: d.fs,
+ ino: d.fs.nextSyntheticIno(),
mode: uint32(opts.mode),
uid: uint32(opts.kuid),
gid: uint32(opts.kgid),
blockSize: usermem.PageSize, // arbitrary
- handle: handle{
- fd: -1,
- },
- nlink: uint32(2),
+ hostFD: -1,
+ nlink: uint32(2),
}
switch opts.mode.FileType() {
case linux.S_IFDIR:
@@ -121,7 +120,7 @@ type directoryFD struct {
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *directoryFD) Release() {
+func (fd *directoryFD) Release(context.Context) {
}
// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
@@ -138,6 +137,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
fd.dirents = ds
}
+ d.InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent)
if d.cachedMetadataAuthoritative() {
d.touchAtime(fd.vfsfd.Mount())
}
@@ -183,13 +183,13 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
{
Name: ".",
Type: linux.DT_DIR,
- Ino: d.ino,
+ Ino: uint64(d.ino),
NextOff: 1,
},
{
Name: "..",
Type: uint8(atomic.LoadUint32(&parent.mode) >> 12),
- Ino: parent.ino,
+ Ino: uint64(parent.ino),
NextOff: 2,
},
}
@@ -203,14 +203,14 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
off := uint64(0)
const count = 64 * 1024 // for consistency with the vfs1 client
d.handleMu.RLock()
- if !d.handleReadable {
+ if d.readFile.isNil() {
// This should not be possible because a readable handle should
// have been opened when the calling directoryFD was opened.
d.handleMu.RUnlock()
panic("gofer.dentry.getDirents called without a readable handle")
}
for {
- p9ds, err := d.handle.file.readdir(ctx, off, count)
+ p9ds, err := d.readFile.readdir(ctx, off, count)
if err != nil {
d.handleMu.RUnlock()
return nil, err
@@ -225,7 +225,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
}
dirent := vfs.Dirent{
Name: p9d.Name,
- Ino: p9d.QID.Path,
+ Ino: uint64(inoFromPath(p9d.QID.Path)),
NextOff: int64(len(dirents) + 1),
}
// p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or
@@ -258,7 +258,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
dirents = append(dirents, vfs.Dirent{
Name: child.name,
Type: uint8(atomic.LoadUint32(&child.mode) >> 12),
- Ino: child.ino,
+ Ino: uint64(child.ino),
NextOff: int64(len(dirents) + 1),
})
}
@@ -299,3 +299,8 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in
return 0, syserror.EINVAL
}
}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *directoryFD) Sync(ctx context.Context) error {
+ return fd.dentry().syncRemoteFile(ctx)
+}
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 7f2181216..40fec890a 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -15,7 +15,9 @@
package gofer
import (
+ "math"
"sync"
+ "sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
@@ -53,8 +55,8 @@ func (fs *filesystem) Sync(ctx context.Context) error {
// Sync regular files.
for _, d := range ds {
- err := d.syncSharedHandle(ctx)
- d.DecRef()
+ err := d.syncCachedFile(ctx)
+ d.DecRef(ctx)
if err != nil && retErr == nil {
retErr = err
}
@@ -64,7 +66,7 @@ func (fs *filesystem) Sync(ctx context.Context) error {
// handles (so they won't be synced by the above).
for _, sffd := range sffds {
err := sffd.Sync(ctx)
- sffd.vfsfd.DecRef()
+ sffd.vfsfd.DecRef(ctx)
if err != nil && retErr == nil {
retErr = err
}
@@ -118,7 +120,7 @@ func putDentrySlice(ds *[]*dentry) {
// must be up to date.
//
// Postconditions: The returned dentry's cached metadata is up to date.
-func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
+func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
if !d.isDir() {
return nil, syserror.ENOTDIR
}
@@ -132,7 +134,7 @@ afterSymlink:
return d, nil
}
if name == ".." {
- if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil {
+ if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
return nil, err
} else if isRoot || d.parent == nil {
rp.Advance()
@@ -145,15 +147,13 @@ afterSymlink:
//
// Call rp.CheckMount() before updating d.parent's metadata, since if
// we traverse to another mount then d.parent's metadata is irrelevant.
- if err := rp.CheckMount(&d.parent.vfsd); err != nil {
+ if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
return nil, err
}
if d != d.parent && !d.cachedMetadataAuthoritative() {
- _, attrMask, attr, err := d.parent.file.getAttr(ctx, dentryAttrMask())
- if err != nil {
+ if err := d.parent.updateFromGetattr(ctx); err != nil {
return nil, err
}
- d.parent.updateFromP9Attrs(attrMask, &attr)
}
rp.Advance()
return d.parent, nil
@@ -165,10 +165,10 @@ afterSymlink:
if child == nil {
return nil, syserror.ENOENT
}
- if err := rp.CheckMount(&child.vfsd); err != nil {
+ if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
return nil, err
}
- if child.isSymlink() && rp.ShouldFollowSymlink() {
+ if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() {
target, err := child.readlink(ctx, rp.Mount())
if err != nil {
return nil, err
@@ -208,18 +208,28 @@ func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFil
// Preconditions: As for getChildLocked. !parent.isSynthetic().
func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) {
+ if child != nil {
+ // Need to lock child.metadataMu because we might be updating child
+ // metadata. We need to hold the lock *before* getting metadata from the
+ // server and release it after updating local metadata.
+ child.metadataMu.Lock()
+ }
qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name)
if err != nil && err != syserror.ENOENT {
+ if child != nil {
+ child.metadataMu.Unlock()
+ }
return nil, err
}
if child != nil {
- if !file.isNil() && qid.Path == child.ino {
- // The file at this path hasn't changed. Just update cached
- // metadata.
+ if !file.isNil() && inoFromPath(qid.Path) == child.ino {
+ // The file at this path hasn't changed. Just update cached metadata.
file.close(ctx)
- child.updateFromP9Attrs(attrMask, &attr)
+ child.updateFromP9AttrsLocked(attrMask, &attr)
+ child.metadataMu.Unlock()
return child, nil
}
+ child.metadataMu.Unlock()
if file.isNil() && child.isSynthetic() {
// We have a synthetic file, and no remote file has arisen to
// replace it.
@@ -230,7 +240,7 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
// has 0 references, drop it). Wait to update parent.children until we
// know what to replace the existing dentry with (i.e. one of the
// returns below), to avoid a redundant map access.
- vfsObj.InvalidateDentry(&child.vfsd)
+ vfsObj.InvalidateDentry(ctx, &child.vfsd)
if child.isSynthetic() {
// Normally we don't mark invalidated dentries as deleted since
// they may still exist (but at a different path), and also for
@@ -275,7 +285,7 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
for !rp.Final() {
d.dirMu.Lock()
- next, err := fs.stepLocked(ctx, rp, d, ds)
+ next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
d.dirMu.Unlock()
if err != nil {
return nil, err
@@ -301,7 +311,7 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
}
for !rp.Done() {
d.dirMu.Lock()
- next, err := fs.stepLocked(ctx, rp, d, ds)
+ next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
d.dirMu.Unlock()
if err != nil {
return nil, err
@@ -323,7 +333,7 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string) error, createInSyntheticDir func(parent *dentry, name string) error) error {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
start := rp.Start().Impl().(*dentry)
if !start.cachedMetadataAuthoritative() {
// Get updated metadata for start as required by
@@ -371,17 +381,33 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
}
parent.touchCMtime()
parent.dirents = nil
+ ev := linux.IN_CREATE
+ if dir {
+ ev |= linux.IN_ISDIR
+ }
+ parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
return nil
}
if fs.opts.interop == InteropModeShared {
- // The existence of a dentry at name would be inconclusive because the
- // file it represents may have been deleted from the remote filesystem,
- // so we would need to make an RPC to revalidate the dentry. Just
- // attempt the file creation RPC instead. If a file does exist, the RPC
- // will fail with EEXIST like we would have. If the RPC succeeds, and a
- // stale dentry exists, the dentry will fail revalidation next time
- // it's used.
- return createInRemoteDir(parent, name)
+ if child := parent.children[name]; child != nil && child.isSynthetic() {
+ return syserror.EEXIST
+ }
+ // The existence of a non-synthetic dentry at name would be inconclusive
+ // because the file it represents may have been deleted from the remote
+ // filesystem, so we would need to make an RPC to revalidate the dentry.
+ // Just attempt the file creation RPC instead. If a file does exist, the
+ // RPC will fail with EEXIST like we would have. If the RPC succeeds, and a
+ // stale dentry exists, the dentry will fail revalidation next time it's
+ // used.
+ if err := createInRemoteDir(parent, name); err != nil {
+ return err
+ }
+ ev := linux.IN_CREATE
+ if dir {
+ ev |= linux.IN_ISDIR
+ }
+ parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
+ return nil
}
if child := parent.children[name]; child != nil {
return syserror.EEXIST
@@ -397,6 +423,11 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
}
parent.touchCMtime()
parent.dirents = nil
+ ev := linux.IN_CREATE
+ if dir {
+ ev |= linux.IN_ISDIR
+ }
+ parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
return nil
}
@@ -404,7 +435,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool) error {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
start := rp.Start().Impl().(*dentry)
if !start.cachedMetadataAuthoritative() {
// Get updated metadata for start as required by
@@ -440,24 +471,64 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
}
vfsObj := rp.VirtualFilesystem()
mntns := vfs.MountNamespaceFromContext(ctx)
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
parent.dirMu.Lock()
defer parent.dirMu.Unlock()
+
child, ok := parent.children[name]
if ok && child == nil {
return syserror.ENOENT
}
- // We only need a dentry representing the file at name if it can be a mount
- // point. If child is nil, then it can't be a mount point. If child is
- // non-nil but stale, the actual file can't be a mount point either; we
- // detect this case by just speculatively calling PrepareDeleteDentry and
- // only revalidating the dentry if that fails (indicating that the existing
- // dentry is a mount point).
+
+ sticky := atomic.LoadUint32(&parent.mode)&linux.ModeSticky != 0
+ if sticky {
+ if !ok {
+ // If the sticky bit is set, we need to retrieve the child to determine
+ // whether removing it is allowed.
+ child, err = fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
+ if err != nil {
+ return err
+ }
+ } else if child != nil && !child.cachedMetadataAuthoritative() {
+ // Make sure the dentry representing the file at name is up to date
+ // before examining its metadata.
+ child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds)
+ if err != nil {
+ return err
+ }
+ }
+ if err := parent.mayDelete(rp.Credentials(), child); err != nil {
+ return err
+ }
+ }
+
+ // If a child dentry exists, prepare to delete it. This should fail if it is
+ // a mount point. We detect mount points by speculatively calling
+ // PrepareDeleteDentry, which fails if child is a mount point. However, we
+ // may need to revalidate the file in this case to make sure that it has not
+ // been deleted or replaced on the remote fs, in which case the mount point
+ // will have disappeared. If calling PrepareDeleteDentry fails again on the
+ // up-to-date dentry, we can be sure that it is a mount point.
+ //
+ // Also note that if child is nil, then it can't be a mount point.
if child != nil {
+ // Hold child.dirMu so we can check child.children and
+ // child.syntheticChildren. We don't access these fields until a bit later,
+ // but locking child.dirMu after calling vfs.PrepareDeleteDentry() would
+ // create an inconsistent lock ordering between dentry.dirMu and
+ // vfs.Dentry.mu (in the VFS lock order, it would make dentry.dirMu both "a
+ // FilesystemImpl lock" and "a lock acquired by a FilesystemImpl between
+ // PrepareDeleteDentry and CommitDeleteDentry"). To avoid this, lock
+ // child.dirMu before calling PrepareDeleteDentry.
child.dirMu.Lock()
defer child.dirMu.Unlock()
if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
- if parent.cachedMetadataAuthoritative() {
+ // We can skip revalidation in several cases:
+ // - We are not in InteropModeShared
+ // - The parent directory is synthetic, in which case the child must also
+ // be synthetic
+ // - We already updated the child during the sticky bit check above
+ if parent.cachedMetadataAuthoritative() || sticky {
return err
}
child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds)
@@ -518,7 +589,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
if child == nil {
return syserror.ENOENT
}
- } else {
+ } else if child == nil || !child.isSynthetic() {
err = parent.file.unlinkAt(ctx, name, flags)
if err != nil {
if child != nil {
@@ -527,8 +598,20 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
return err
}
}
+
+ // Generate inotify events for rmdir or unlink.
+ if dir {
+ parent.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */)
+ } else {
+ var cw *vfs.Watches
+ if child != nil {
+ cw = &child.watches
+ }
+ vfs.InotifyRemoveChild(ctx, cw, &parent.watches, name)
+ }
+
if child != nil {
- vfsObj.CommitDeleteDentry(&child.vfsd)
+ vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
child.setDeleted()
if child.isSynthetic() {
parent.syntheticChildren--
@@ -555,7 +638,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
// but dentry slices are allocated lazily, and it's much easier to say "defer
// fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() {
// fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this.
-func (fs *filesystem) renameMuRUnlockAndCheckCaching(ds **[]*dentry) {
+func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) {
fs.renameMu.RUnlock()
if *ds == nil {
return
@@ -563,20 +646,20 @@ func (fs *filesystem) renameMuRUnlockAndCheckCaching(ds **[]*dentry) {
if len(**ds) != 0 {
fs.renameMu.Lock()
for _, d := range **ds {
- d.checkCachingLocked()
+ d.checkCachingLocked(ctx)
}
fs.renameMu.Unlock()
}
putDentrySlice(*ds)
}
-func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) {
+func (fs *filesystem) renameMuUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) {
if *ds == nil {
fs.renameMu.Unlock()
return
}
for _, d := range **ds {
- d.checkCachingLocked()
+ d.checkCachingLocked(ctx)
}
fs.renameMu.Unlock()
putDentrySlice(*ds)
@@ -586,7 +669,7 @@ func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) {
func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return err
@@ -598,7 +681,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds
func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return nil, err
@@ -619,7 +702,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
start := rp.Start().Impl().(*dentry)
if !start.cachedMetadataAuthoritative() {
// Get updated metadata for start as required by
@@ -642,8 +725,29 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
if rp.Mount() != vd.Mount() {
return syserror.EXDEV
}
- // 9P2000.L supports hard links, but we don't.
- return syserror.EPERM
+ d := vd.Dentry().Impl().(*dentry)
+ if d.isDir() {
+ return syserror.EPERM
+ }
+ gid := auth.KGID(atomic.LoadUint32(&d.gid))
+ uid := auth.KUID(atomic.LoadUint32(&d.uid))
+ mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+ if err := vfs.MayLink(rp.Credentials(), mode, uid, gid); err != nil {
+ return err
+ }
+ if d.nlink == 0 {
+ return syserror.ENOENT
+ }
+ if d.nlink == math.MaxUint32 {
+ return syserror.EMLINK
+ }
+ if err := parent.file.link(ctx, d.file, childName); err != nil {
+ return err
+ }
+
+ // Success!
+ atomic.AddUint32(&d.nlink, 1)
+ return nil
}, nil)
}
@@ -730,7 +834,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
start := rp.Start().Impl().(*dentry)
if !start.cachedMetadataAuthoritative() {
@@ -740,6 +844,13 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
}
}
if rp.Done() {
+ // Reject attempts to open the mount root directory with O_CREAT.
+ if mayCreate && rp.MustBeDir() {
+ return nil, syserror.EISDIR
+ }
+ if mustCreate {
+ return nil, syserror.EEXIST
+ }
return start.openLocked(ctx, rp, &opts)
}
@@ -752,27 +863,33 @@ afterTrailingSymlink:
if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
return nil, err
}
+ // Reject attempts to open directories with O_CREAT.
+ if mayCreate && rp.MustBeDir() {
+ return nil, syserror.EISDIR
+ }
// Determine whether or not we need to create a file.
parent.dirMu.Lock()
- child, err := fs.stepLocked(ctx, rp, parent, &ds)
+ child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
if err == syserror.ENOENT && mayCreate {
if parent.isSynthetic() {
parent.dirMu.Unlock()
return nil, syserror.EPERM
}
- fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts)
+ fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts, &ds)
parent.dirMu.Unlock()
return fd, err
}
+ parent.dirMu.Unlock()
if err != nil {
- parent.dirMu.Unlock()
return nil, err
}
- // Open existing child or follow symlink.
- parent.dirMu.Unlock()
if mustCreate {
return nil, syserror.EEXIST
}
+ if !child.isDir() && rp.MustBeDir() {
+ return nil, syserror.ENOTDIR
+ }
+ // Open existing child or follow symlink.
if child.isSymlink() && rp.ShouldFollowSymlink() {
target, err := child.readlink(ctx, rp.Mount())
if err != nil {
@@ -793,20 +910,32 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
return nil, err
}
+
+ trunc := opts.Flags&linux.O_TRUNC != 0 && d.fileType() == linux.S_IFREG
+ if trunc {
+ // Lock metadataMu *while* we open a regular file with O_TRUNC because
+ // open(2) will change the file size on server.
+ d.metadataMu.Lock()
+ defer d.metadataMu.Unlock()
+ }
+
+ var vfd *vfs.FileDescription
+ var err error
mnt := rp.Mount()
switch d.fileType() {
case linux.S_IFREG:
if !d.fs.opts.regularFilesUseSpecialFileFD {
- if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0); err != nil {
+ if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, trunc); err != nil {
return nil, err
}
fd := &regularFileFD{}
+ fd.LockFD.Init(&d.locks)
if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
AllowDirectIO: true,
}); err != nil {
return nil, err
}
- return &fd.vfsfd, nil
+ vfd = &fd.vfsfd
}
case linux.S_IFDIR:
// Can't open directories with O_CREAT.
@@ -826,6 +955,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
}
}
fd := &directoryFD{}
+ fd.LockFD.Init(&d.locks)
if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
return nil, err
}
@@ -842,10 +972,28 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
}
case linux.S_IFIFO:
if d.isSynthetic() {
- return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags)
+ return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags, &d.locks)
+ }
+ }
+
+ if vfd == nil {
+ if vfd, err = d.openSpecialFileLocked(ctx, mnt, opts); err != nil {
+ return nil, err
}
}
- return d.openSpecialFileLocked(ctx, mnt, opts)
+
+ if trunc {
+ // If no errors occurred so far, update the file size in memory. This
+ // step is required even if !d.cachedMetadataAuthoritative() because
+ // d.mappings has to be updated.
+ // d.metadataMu has already been acquired if trunc == true.
+ d.updateFileSizeLocked(0)
+
+ if d.cachedMetadataAuthoritative() {
+ d.touchCMtimeLocked()
+ }
+ }
+ return vfd, err
}
func (d *dentry) connectSocketLocked(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
@@ -873,19 +1021,37 @@ func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts
if opts.Flags&linux.O_DIRECT != 0 {
return nil, syserror.EINVAL
}
- h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0)
+ // We assume that the server silently inserts O_NONBLOCK in the open flags
+ // for all named pipes (because all existing gofers do this).
+ //
+ // NOTE(b/133875563): This makes named pipe opens racy, because the
+ // mechanisms for translating nonblocking to blocking opens can only detect
+ // the instantaneous presence of a peer holding the other end of the pipe
+ // open, not whether the pipe was *previously* opened by a peer that has
+ // since closed its end.
+ isBlockingOpenOfNamedPipe := d.fileType() == linux.S_IFIFO && opts.Flags&linux.O_NONBLOCK == 0
+retry:
+ h, err := openHandle(ctx, d.file, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0)
if err != nil {
+ if isBlockingOpenOfNamedPipe && ats == vfs.MayWrite && err == syserror.ENXIO {
+ // An attempt to open a named pipe with O_WRONLY|O_NONBLOCK fails
+ // with ENXIO if opening the same named pipe with O_WRONLY would
+ // block because there are no readers of the pipe.
+ if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil {
+ return nil, err
+ }
+ goto retry
+ }
return nil, err
}
- seekable := d.fileType() == linux.S_IFREG
- fd := &specialFileFD{
- handle: h,
- seekable: seekable,
+ if isBlockingOpenOfNamedPipe && ats == vfs.MayRead && h.fd >= 0 {
+ if err := blockUntilNonblockingPipeHasWriter(ctx, h.fd); err != nil {
+ h.close(ctx)
+ return nil, err
+ }
}
- if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
- DenyPRead: !seekable,
- DenyPWrite: !seekable,
- }); err != nil {
+ fd, err := newSpecialFileFD(h, mnt, d, &d.locks, opts.Flags)
+ if err != nil {
h.close(ctx)
return nil, err
}
@@ -894,7 +1060,7 @@ func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts
// Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked.
// !d.isSynthetic().
-func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) {
if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
return nil, err
}
@@ -919,7 +1085,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
// Filter file creation flags and O_LARGEFILE out; the create RPC already
// has the semantics of O_CREAT|O_EXCL, while some servers will choke on
// O_LARGEFILE.
- createFlags := p9.OpenFlags(opts.Flags &^ (linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_LARGEFILE))
+ createFlags := p9.OpenFlags(opts.Flags &^ (vfs.FileCreationFlags | linux.O_LARGEFILE))
fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
if err != nil {
dirfile.close(ctx)
@@ -947,22 +1113,25 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
}
return nil, err
}
+ *ds = appendDentry(*ds, child)
// Incorporate the fid that was opened by lcreate.
useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD
if useRegularFileFD {
child.handleMu.Lock()
- child.handle.file = openFile
- if fdobj != nil {
- child.handle.fd = int32(fdobj.Release())
+ if vfs.MayReadFileWithOpenFlags(opts.Flags) {
+ child.readFile = openFile
+ if fdobj != nil {
+ child.hostFD = int32(fdobj.Release())
+ }
+ } else if fdobj != nil {
+ // Can't use fdobj if it's not readable.
+ fdobj.Close()
+ }
+ if vfs.MayWriteFileWithOpenFlags(opts.Flags) {
+ child.writeFile = openFile
}
- child.handleReadable = vfs.MayReadFileWithOpenFlags(opts.Flags)
- child.handleWritable = vfs.MayWriteFileWithOpenFlags(opts.Flags)
child.handleMu.Unlock()
}
- // Take a reference on the new dentry to be held by the new file
- // description. (This reference also means that the new dentry is not
- // eligible for caching yet, so we don't need to append to a dentry slice.)
- child.refs = 1
// Insert the dentry into the tree.
d.cacheNewChildLocked(child, name)
if d.cachedMetadataAuthoritative() {
@@ -974,6 +1143,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
var childVFSFD *vfs.FileDescription
if useRegularFileFD {
fd := &regularFileFD{}
+ fd.LockFD.Init(&child.locks)
if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{
AllowDirectIO: true,
}); err != nil {
@@ -981,26 +1151,21 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
}
childVFSFD = &fd.vfsfd
} else {
- seekable := child.fileType() == linux.S_IFREG
- fd := &specialFileFD{
- handle: handle{
- file: openFile,
- fd: -1,
- },
- seekable: seekable,
+ h := handle{
+ file: openFile,
+ fd: -1,
}
if fdobj != nil {
- fd.handle.fd = int32(fdobj.Release())
+ h.fd = int32(fdobj.Release())
}
- if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{
- DenyPRead: !seekable,
- DenyPWrite: !seekable,
- }); err != nil {
- fd.handle.close(ctx)
+ fd, err := newSpecialFileFD(h, mnt, child, &d.locks, opts.Flags)
+ if err != nil {
+ h.close(ctx)
return nil, err
}
childVFSFD = &fd.vfsfd
}
+ d.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */)
return childVFSFD, nil
}
@@ -1008,7 +1173,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return "", err
@@ -1028,7 +1193,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
var ds *[]*dentry
fs.renameMu.Lock()
- defer fs.renameMuUnlockAndCheckCaching(&ds)
+ defer fs.renameMuUnlockAndCheckCaching(ctx, &ds)
newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds)
if err != nil {
return err
@@ -1052,7 +1217,8 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
return err
}
}
- if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+ creds := rp.Credentials()
+ if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
return err
}
vfsObj := rp.VirtualFilesystem()
@@ -1067,12 +1233,15 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
if renamed == nil {
return syserror.ENOENT
}
+ if err := oldParent.mayDelete(creds, renamed); err != nil {
+ return err
+ }
if renamed.isDir() {
if renamed == newParent || genericIsAncestorDentry(renamed, newParent) {
return syserror.EINVAL
}
if oldParent != newParent {
- if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+ if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil {
return err
}
}
@@ -1083,7 +1252,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
}
if oldParent != newParent {
- if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+ if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
return err
}
newParent.dirMu.Lock()
@@ -1114,7 +1283,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
return nil
}
mntns := vfs.MountNamespaceFromContext(ctx)
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
return err
}
@@ -1139,7 +1308,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
}
// Update the dentry tree.
- vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD)
+ vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD)
if replaced != nil {
replaced.setDeleted()
if replaced.isSynthetic() {
@@ -1181,10 +1350,12 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
if newParent.cachedMetadataAuthoritative() {
newParent.dirents = nil
newParent.touchCMtime()
- if renamed.isDir() {
+ if renamed.isDir() && (replaced == nil || !replaced.isDir()) {
+ // Increase the link count if we did not replace another directory.
newParent.incLinks()
}
}
+ vfs.InotifyRename(ctx, &renamed.watches, &oldParent.watches, &newParent.watches, oldName, newName, renamed.isDir())
return nil
}
@@ -1197,19 +1368,28 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ return err
+ }
+ if err := d.setStat(ctx, rp.Credentials(), &opts, rp.Mount()); err != nil {
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
return err
}
- return d.setStat(ctx, rp.Credentials(), &opts.Stat, rp.Mount())
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+
+ if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+ d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
+ }
+ return nil
}
// StatAt implements vfs.FilesystemImpl.StatAt.
func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return linux.Statx{}, err
@@ -1226,7 +1406,7 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return linux.Statfs{}, err
@@ -1276,7 +1456,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return nil, err
@@ -1302,7 +1482,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return nil, err
@@ -1314,7 +1494,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return "", err
@@ -1326,24 +1506,38 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
return err
}
- return d.setxattr(ctx, rp.Credentials(), &opts)
+ if err := d.setxattr(ctx, rp.Credentials(), &opts); err != nil {
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ return err
+ }
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ return nil
}
// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(&ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
return err
}
- return d.removexattr(ctx, rp.Credentials(), name)
+ if err := d.removexattr(ctx, rp.Credentials(), name); err != nil {
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ return err
+ }
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ return nil
}
// PrependPath implements vfs.FilesystemImpl.PrependPath.
@@ -1352,3 +1546,7 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
defer fs.renameMu.RUnlock()
return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b)
}
+
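+// nextSyntheticIno returns a unique inode number for a synthetic dentry, with
+// the MSB set to avoid collisions with remote inode numbers.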
+func (fs *filesystem) nextSyntheticIno() inodeNumber {
+ return inodeNumber(atomic.AddUint64(&fs.syntheticSeq, 1) | syntheticInoMask)
+}
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 6295f6b54..63e589859 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -45,6 +45,7 @@ import (
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -84,12 +85,6 @@ type filesystem struct {
// devMinor is the filesystem's minor device number. devMinor is immutable.
devMinor uint32
- // uid and gid are the effective KUID and KGID of the filesystem's creator,
- // and are used as the owner and group for files that don't specify one.
- // uid and gid are immutable.
- uid auth.KUID
- gid auth.KGID
-
// renameMu serves two purposes:
//
// - It synchronizes path resolution with renaming initiated by this
@@ -115,6 +110,26 @@ type filesystem struct {
syncMu sync.Mutex
syncableDentries map[*dentry]struct{}
specialFileFDs map[*specialFileFD]struct{}
+
+ // syntheticSeq stores a counter used to generate unique inode numbers for
+ // synthetic dentries.
+ syntheticSeq uint64
+}
+
+// inodeNumber represents the inode number reported in Dirent.Ino. For regular
+// dentries, it comes from the QID.Path returned by the 9P server. Synthetic
+// dentries have their inode numbers generated sequentially, with the MSB
+// reserved to prevent conflicts with regular dentries.
+type inodeNumber uint64
+
+// Reserve the MSB of inode numbers for synthetic dentries.
+const syntheticInoMask = uint64(1) << 63
+
+func inoFromPath(path uint64) inodeNumber {
+ if path&syntheticInoMask != 0 {
+ log.Warningf("Dropping MSB from ino, collision is possible. Original: %d, new: %d", path, path&^syntheticInoMask)
+ }
+ return inodeNumber(path &^ syntheticInoMask)
}
type filesystemOptions struct {
@@ -122,6 +137,8 @@ type filesystemOptions struct {
fd int
aname string
interop InteropMode // derived from the "cache" mount option
+ dfltuid auth.KUID
+ dfltgid auth.KGID
msize uint32
version string
@@ -175,10 +192,14 @@ const (
//
// - File timestamps are based on client clocks. This ensures that users of
// the client observe timestamps that are coherent with their own clocks
- // and consistent with Linux's semantics. However, since it is not always
- // possible for clients to set arbitrary atimes and mtimes, and never
- // possible for clients to set arbitrary ctimes, file timestamp changes are
- // stored in the client only and never sent to the remote filesystem.
+ // and consistent with Linux's semantics (in particular, it is not always
+ // possible for clients to set arbitrary atimes and mtimes depending on the
+ // remote filesystem implementation, and never possible for clients to set
+ // arbitrary ctimes.) If a dentry containing a client-defined atime or
+ // mtime is evicted from cache, client timestamps will be sent to the
+ // remote filesystem on a best-effort basis to attempt to ensure that
+ // timestamps will be preserved when another dentry representing the same
+ // file is instantiated.
InteropModeExclusive InteropMode = iota
// InteropModeWritethrough is appropriate when there are read-only users of
@@ -230,6 +251,15 @@ type InternalFilesystemOptions struct {
OpenSocketsByConnecting bool
}
+// _V9FS_DEFUID and _V9FS_DEFGID (from Linux's fs/9p/v9fs.h) are the default
+// UID and GID used for files that do not specify an owner or group,
+// respectively.
+const (
+ // uint32(-2) doesn't work in Go.
+ _V9FS_DEFUID = auth.KUID(4294967294)
+ _V9FS_DEFGID = auth.KGID(4294967294)
+)
+
// Name implements vfs.FilesystemType.Name.
func (FilesystemType) Name() string {
return Name
@@ -315,6 +345,31 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
}
}
+ // Parse the default UID and GID.
+ fsopts.dfltuid = _V9FS_DEFUID
+ if dfltuidstr, ok := mopts["dfltuid"]; ok {
+ delete(mopts, "dfltuid")
+ dfltuid, err := strconv.ParseUint(dfltuidstr, 10, 32)
+ if err != nil {
+ ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: dfltuid=%s", dfltuidstr)
+ return nil, nil, syserror.EINVAL
+ }
+ // In Linux, dfltuid is interpreted as a UID and is converted to a KUID
+ // in the caller's user namespace, but goferfs isn't
+ // application-mountable.
+ fsopts.dfltuid = auth.KUID(dfltuid)
+ }
+ fsopts.dfltgid = _V9FS_DEFGID
+ if dfltgidstr, ok := mopts["dfltgid"]; ok {
+ delete(mopts, "dfltgid")
+ dfltgid, err := strconv.ParseUint(dfltgidstr, 10, 32)
+ if err != nil {
+ ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: dfltgid=%s", dfltgidstr)
+ return nil, nil, syserror.EINVAL
+ }
+ fsopts.dfltgid = auth.KGID(dfltgid)
+ }
+
// Parse the 9P message size.
fsopts.msize = 1024 * 1024 // 1M, tested to give good enough performance up to 64M
if msizestr, ok := mopts["msize"]; ok {
@@ -422,8 +477,6 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
client: client,
clock: ktime.RealtimeClockFromContext(ctx),
devMinor: devMinor,
- uid: creds.EffectiveKUID,
- gid: creds.EffectiveKGID,
syncableDentries: make(map[*dentry]struct{}),
specialFileFDs: make(map[*specialFileFD]struct{}),
}
@@ -433,7 +486,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr)
if err != nil {
attachFile.close(ctx)
- fs.vfsfs.DecRef()
+ fs.vfsfs.DecRef(ctx)
return nil, nil, err
}
// Set the root's reference count to 2. One reference is returned to the
@@ -446,17 +499,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
}
// Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {
- ctx := context.Background()
+func (fs *filesystem) Release(ctx context.Context) {
mf := fs.mfp.MemoryFile()
fs.syncMu.Lock()
for d := range fs.syncableDentries {
d.handleMu.Lock()
d.dataMu.Lock()
- if d.handleWritable {
+ if h := d.writeHandleLocked(); h.isOpen() {
// Write dirty cached data to the remote file.
- if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt); err != nil {
+ if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, fs.mfp.MemoryFile(), h.writeFromBlocksAt); err != nil {
log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err)
}
// TODO(jamieliu): Do we need to flushf/fsync d?
@@ -466,9 +518,9 @@ func (fs *filesystem) Release() {
d.dirty.RemoveAll()
d.dataMu.Unlock()
// Close the host fd if one exists.
- if d.handle.fd >= 0 {
- syscall.Close(int(d.handle.fd))
- d.handle.fd = -1
+ if d.hostFD >= 0 {
+ syscall.Close(int(d.hostFD))
+ d.hostFD = -1
}
d.handleMu.Unlock()
}
@@ -510,8 +562,6 @@ type dentry struct {
// filesystem.renameMu.
name string
- // We don't support hard links, so each dentry maps 1:1 to an inode.
-
// file is the unopened p9.File that backs this dentry. file is immutable.
//
// If file.isNil(), this dentry represents a synthetic file, i.e. a file
@@ -553,22 +603,39 @@ type dentry struct {
// returned by the server. dirents is protected by dirMu.
dirents []vfs.Dirent
- // Cached metadata; protected by metadataMu and accessed using atomic
- // memory operations unless otherwise specified.
+ // Cached metadata; protected by metadataMu.
+ // To access:
+ // - In situations where consistency is not required (like stat), these
+ //   can be accessed using atomic operations only (without locking).
+ // - Otherwise, lock metadataMu and access without atomic operations.
+ // To mutate:
+ // - Lock metadataMu and update using atomic operations, because there
+ //   may be atomic readers that don't hold the lock.
metadataMu sync.Mutex
- ino uint64 // immutable
- mode uint32 // type is immutable, perms are mutable
- uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
- gid uint32 // auth.KGID, but ...
- blockSize uint32 // 0 if unknown
+ ino inodeNumber // immutable
+ mode uint32 // type is immutable, perms are mutable
+ uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
+ gid uint32 // auth.KGID, but ...
+ blockSize uint32 // 0 if unknown
// Timestamps, all nsecs from the Unix epoch.
atime int64
mtime int64
ctime int64
btime int64
- // File size, protected by both metadataMu and dataMu (i.e. both must be
- // locked to mutate it).
+ // File size, which differs from other metadata in two ways:
+ //
+ // - We make a best-effort attempt to keep it up to date even if
+ // !dentry.cachedMetadataAuthoritative() for the sake of O_APPEND writes.
+ //
+ // - size is protected by both metadataMu and dataMu (i.e. both must be
+ // locked to mutate it; locking either is sufficient to access it).
size uint64
+ // If this dentry does not represent a synthetic file, deleted is 0, and
+ // atimeDirty/mtimeDirty are non-zero, atime/mtime may have diverged from the
+ // remote file's timestamps, which should be updated when this dentry is
+ // evicted.
+ atimeDirty uint32
+ mtimeDirty uint32
// nlink counts the number of hard links to this dentry. It's updated and
// accessed using atomic operations. It's not protected by metadataMu like the
@@ -581,30 +648,28 @@ type dentry struct {
// the file into memmap.MappingSpaces. mappings is protected by mapsMu.
mappings memmap.MappingSet
- // If this dentry represents a regular file or directory:
- //
- // - handle is the I/O handle used by all regularFileFDs/directoryFDs
- // representing this dentry.
- //
- // - handleReadable is true if handle is readable.
- //
- // - handleWritable is true if handle is writable.
- //
- // Invariants:
+ // - If this dentry represents a regular file or directory, readFile is the
+ // p9.File used for reads by all regularFileFDs/directoryFDs representing
+ // this dentry.
//
- // - If handleReadable == handleWritable == false, then handle.file == nil
- // (i.e. there is no open handle). Conversely, if handleReadable ||
- // handleWritable == true, then handle.file != nil (i.e. there is an open
- // handle).
+ // - If this dentry represents a regular file, writeFile is the p9.File
+ // used for writes by all regularFileFDs representing this dentry.
//
- // - handleReadable and handleWritable cannot transition from true to false
- // (i.e. handles may not be downgraded).
+ // - If this dentry represents a regular file, hostFD is the host FD used
+ // for memory mappings and I/O (when applicable) in preference to readFile
+ // and writeFile. hostFD is always readable; if !writeFile.isNil(), it must
+ // also be writable. If hostFD is -1, no such host FD is available.
//
// These fields are protected by handleMu.
- handleMu sync.RWMutex
- handle handle
- handleReadable bool
- handleWritable bool
+ //
+ // readFile and writeFile may or may not represent the same p9.File. Once
+ // either p9.File transitions from closed (isNil() == true) to open
+ // (isNil() == false), it may be mutated with handleMu locked, but cannot
+ // be closed until the dentry is destroyed.
+ handleMu sync.RWMutex
+ readFile p9file
+ writeFile p9file
+ hostFD int32
dataMu sync.RWMutex
@@ -618,7 +683,7 @@ type dentry struct {
// tracks dirty segments in cache. dirty is protected by dataMu.
dirty fsutil.DirtySet
- // pf implements platform.File for mappings of handle.fd.
+ // pf implements platform.File for mappings of hostFD.
pf dentryPlatformFile
// If this dentry represents a symbolic link, InteropModeShared is not in
@@ -634,6 +699,11 @@ type dentry struct {
// If this dentry represents a synthetic named pipe, pipe is the pipe
// endpoint bound to this file.
pipe *pipe.VFSPipe
+
+ locks vfs.FileLocks
+
+ // Inotify watches for this dentry.
+ watches vfs.Watches
}
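// A minimal editorial sketch of the metadata access discipline noted above
// (assumes only the standard library's sync and sync/atomic; newMode is a
// hypothetical value):
//
//	// Consistency-insensitive read (e.g. stat): atomic load, no lock.
//	mode := atomic.LoadUint32(&d.mode)
//
//	// Consistent read: hold metadataMu; plain reads are then safe, since
//	// all writers hold the lock.
//	d.metadataMu.Lock()
//	mtime := d.mtime
//	d.metadataMu.Unlock()
//
//	// Mutation: hold metadataMu and still store atomically, because
//	// lock-free atomic readers may race with the update.
//	d.metadataMu.Lock()
//	atomic.StoreUint32(&d.mode, newMode)
//	d.metadataMu.Unlock()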
// dentryAttrMask returns a p9.AttrMask enabling all attributes used by the
@@ -670,14 +740,12 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
d := &dentry{
fs: fs,
file: file,
- ino: qid.Path,
+ ino: inoFromPath(qid.Path),
mode: uint32(attr.Mode),
- uid: uint32(fs.uid),
- gid: uint32(fs.gid),
+ uid: uint32(fs.opts.dfltuid),
+ gid: uint32(fs.opts.dfltgid),
blockSize: usermem.PageSize,
- handle: handle{
- fd: -1,
- },
+ hostFD: -1,
}
d.pf.dentry = d
if mask.UID {
@@ -725,8 +793,8 @@ func (d *dentry) cachedMetadataAuthoritative() bool {
// updateFromP9Attrs is called to update d's metadata after an update from the
// remote filesystem.
-func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) {
- d.metadataMu.Lock()
+// Precondition: d.metadataMu must be locked.
+func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) {
if mask.Mode {
if got, want := uint32(attr.Mode.FileType()), d.fileType(); got != want {
d.metadataMu.Unlock()
@@ -744,10 +812,12 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) {
if attr.BlockSize != 0 {
atomic.StoreUint32(&d.blockSize, uint32(attr.BlockSize))
}
- if mask.ATime {
+ // Don't override newer client-defined timestamps with old server-defined
+ // ones.
+ if mask.ATime && atomic.LoadUint32(&d.atimeDirty) == 0 {
atomic.StoreInt64(&d.atime, dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds))
}
- if mask.MTime {
+ if mask.MTime && atomic.LoadUint32(&d.mtimeDirty) == 0 {
atomic.StoreInt64(&d.mtime, dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds))
}
if mask.CTime {
@@ -760,25 +830,33 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) {
atomic.StoreUint32(&d.nlink, uint32(attr.NLink))
}
if mask.Size {
- d.dataMu.Lock()
- atomic.StoreUint64(&d.size, attr.Size)
- d.dataMu.Unlock()
+ d.updateFileSizeLocked(attr.Size)
}
- d.metadataMu.Unlock()
}
// Preconditions: !d.isSynthetic()
func (d *dentry) updateFromGetattr(ctx context.Context) error {
- // Use d.handle.file, which represents a 9P fid that has been opened, in
- // preference to d.file, which represents a 9P fid that has not. This may
- // be significantly more efficient in some implementations.
+ // Use d.readFile or d.writeFile, which represent 9P fids that have been
+ // opened, in preference to d.file, which represents a 9P fid that has not.
+ // This may be significantly more efficient in some implementations. Prefer
+ // d.writeFile over d.readFile since some filesystem implementations may
+ // update a writable handle's metadata after writes to that handle, without
+ // making metadata updates immediately visible to read-only handles
+ // representing the same file.
var (
file p9file
handleMuRLocked bool
)
+ // d.metadataMu must be locked *before* we getAttr so that we do not end up
+ // updating stale attributes in d.updateFromP9AttrsLocked().
+ d.metadataMu.Lock()
+ defer d.metadataMu.Unlock()
d.handleMu.RLock()
- if !d.handle.file.isNil() {
- file = d.handle.file
+ if !d.writeFile.isNil() {
+ file = d.writeFile
+ handleMuRLocked = true
+ } else if !d.readFile.isNil() {
+ file = d.readFile
handleMuRLocked = true
} else {
file = d.file
@@ -791,7 +869,7 @@ func (d *dentry) updateFromGetattr(ctx context.Context) error {
if err != nil {
return err
}
- d.updateFromP9Attrs(attrMask, &attr)
+ d.updateFromP9AttrsLocked(attrMask, &attr)
return nil
}
@@ -803,10 +881,18 @@ func (d *dentry) statTo(stat *linux.Statx) {
stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME
stat.Blksize = atomic.LoadUint32(&d.blockSize)
stat.Nlink = atomic.LoadUint32(&d.nlink)
+ if stat.Nlink == 0 {
+ // The remote filesystem doesn't support link count; just make
+ // something up. This is consistent with Linux, where
+ // fs/inode.c:inode_init_always() initializes link count to 1, and
+ // fs/9p/vfs_inode_dotl.c:v9fs_stat2inode_dotl() doesn't touch it if
+ // it's not provided by the remote filesystem.
+ stat.Nlink = 1
+ }
stat.UID = atomic.LoadUint32(&d.uid)
stat.GID = atomic.LoadUint32(&d.gid)
stat.Mode = uint16(atomic.LoadUint32(&d.mode))
- stat.Ino = d.ino
+ stat.Ino = uint64(d.ino)
stat.Size = atomic.LoadUint64(&d.size)
// This is consistent with regularFileFD.Seek(), which treats regular files
// as having no holes.
@@ -819,7 +905,8 @@ func (d *dentry) statTo(stat *linux.Statx) {
stat.DevMinor = d.fs.devMinor
}
-func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error {
+func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions, mnt *vfs.Mount) error {
+ stat := &opts.Stat
if stat.Mask == 0 {
return nil
}
@@ -827,37 +914,47 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
return syserror.EPERM
}
mode := linux.FileMode(atomic.LoadUint32(&d.mode))
- if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
+ if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
return err
}
if err := mnt.CheckBeginWrite(); err != nil {
return err
}
defer mnt.EndWrite()
- setLocalAtime := false
- setLocalMtime := false
+
+ if stat.Mask&linux.STATX_SIZE != 0 {
+ // Reject attempts to truncate files other than regular files, since
+ // filesystem implementations may return the wrong errno.
+ switch mode.FileType() {
+ case linux.S_IFREG:
+ // ok
+ case linux.S_IFDIR:
+ return syserror.EISDIR
+ default:
+ return syserror.EINVAL
+ }
+ }
+
+ var now int64
if d.cachedMetadataAuthoritative() {
- // Timestamp updates will be handled locally.
- setLocalAtime = stat.Mask&linux.STATX_ATIME != 0
- setLocalMtime = stat.Mask&linux.STATX_MTIME != 0
- stat.Mask &^= linux.STATX_ATIME | linux.STATX_MTIME
-
- // Prepare for truncate.
- if stat.Mask&linux.STATX_SIZE != 0 {
- switch d.mode & linux.S_IFMT {
- case linux.S_IFREG:
- if !setLocalMtime {
- // Truncate updates mtime.
- setLocalMtime = true
- stat.Mtime.Nsec = linux.UTIME_NOW
- }
- case linux.S_IFDIR:
- return syserror.EISDIR
- default:
- return syserror.EINVAL
+ // Truncate updates mtime.
+ if stat.Mask&(linux.STATX_SIZE|linux.STATX_MTIME) == linux.STATX_SIZE {
+ stat.Mask |= linux.STATX_MTIME
+ stat.Mtime = linux.StatxTimestamp{
+ Nsec: linux.UTIME_NOW,
}
}
+
+ // Use client clocks for timestamps.
+ now = d.fs.clock.Now().Nanoseconds()
+ if stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec == linux.UTIME_NOW {
+ stat.Atime = statxTimestampFromDentry(now)
+ }
+ if stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec == linux.UTIME_NOW {
+ stat.Mtime = statxTimestampFromDentry(now)
+ }
}
+
d.metadataMu.Lock()
defer d.metadataMu.Unlock()
if !d.isSynthetic() {
@@ -883,6 +980,12 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
}); err != nil {
return err
}
+ if stat.Mask&linux.STATX_SIZE != 0 {
+ // d.size should be kept up to date, and privatized
+ // copy-on-write mappings of truncated pages need to be
+ // invalidated, even if InteropModeShared is in effect.
+ d.updateFileSizeLocked(stat.Size)
+ }
}
if d.fs.opts.interop == InteropModeShared {
// There's no point to updating d's metadata in this case since
@@ -892,7 +995,6 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
return nil
}
}
- now := d.fs.clock.Now().Nanoseconds()
if stat.Mask&linux.STATX_MODE != 0 {
atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode))
}
@@ -902,61 +1004,66 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
if stat.Mask&linux.STATX_GID != 0 {
atomic.StoreUint32(&d.gid, stat.GID)
}
- if setLocalAtime {
- if stat.Atime.Nsec == linux.UTIME_NOW {
- atomic.StoreInt64(&d.atime, now)
- } else {
- atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime))
- }
+ // Note that stat.Atime.Nsec and stat.Mtime.Nsec can't be UTIME_NOW because
+ // if d.cachedMetadataAuthoritative() then we converted stat.Atime and
+ // stat.Mtime to client-local timestamps above, and if
+ // !d.cachedMetadataAuthoritative() then we returned after calling
+ // d.file.setAttr(). For the same reason, now must have been initialized.
+ if stat.Mask&linux.STATX_ATIME != 0 {
+ atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime))
+ atomic.StoreUint32(&d.atimeDirty, 0)
}
- if setLocalMtime {
- if stat.Mtime.Nsec == linux.UTIME_NOW {
- atomic.StoreInt64(&d.mtime, now)
- } else {
- atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime))
- }
+ if stat.Mask&linux.STATX_MTIME != 0 {
+ atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime))
+ atomic.StoreUint32(&d.mtimeDirty, 0)
}
atomic.StoreInt64(&d.ctime, now)
- if stat.Mask&linux.STATX_SIZE != 0 {
+ return nil
+}
+
+// Preconditions: d.metadataMu must be locked.
+func (d *dentry) updateFileSizeLocked(newSize uint64) {
+ d.dataMu.Lock()
+ oldSize := d.size
+ atomic.StoreUint64(&d.size, newSize)
+ // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings
+ // below. This allows concurrent calls to Read/Translate/etc. These
+ // functions synchronize with truncation by refusing to use cache
+ // contents beyond the new d.size. (We are still holding d.metadataMu,
+ // so we can't race with Write or another truncate.)
+ d.dataMu.Unlock()
+ if d.size < oldSize {
+ oldpgend, _ := usermem.PageRoundUp(oldSize)
+ newpgend, _ := usermem.PageRoundUp(d.size)
+ if oldpgend != newpgend {
+ d.mapsMu.Lock()
+ d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
+ // Compare Linux's mm/truncate.c:truncate_setsize() =>
+ // truncate_pagecache() =>
+ // mm/memory.c:unmap_mapping_range(evencows=1).
+ InvalidatePrivate: true,
+ })
+ d.mapsMu.Unlock()
+ }
+ // We are now guaranteed that there are no translations of
+ // truncated pages, and can remove them from the cache. Since
+ // truncated pages have been removed from the remote file, they
+ // should be dropped without being written back.
d.dataMu.Lock()
- oldSize := d.size
- d.size = stat.Size
- // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings
- // below. This allows concurrent calls to Read/Translate/etc. These
- // functions synchronize with truncation by refusing to use cache
- // contents beyond the new d.size. (We are still holding d.metadataMu,
- // so we can't race with Write or another truncate.)
+ d.cache.Truncate(d.size, d.fs.mfp.MemoryFile())
+ d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend})
d.dataMu.Unlock()
- if d.size < oldSize {
- oldpgend, _ := usermem.PageRoundUp(oldSize)
- newpgend, _ := usermem.PageRoundUp(d.size)
- if oldpgend != newpgend {
- d.mapsMu.Lock()
- d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
- // Compare Linux's mm/truncate.c:truncate_setsize() =>
- // truncate_pagecache() =>
- // mm/memory.c:unmap_mapping_range(evencows=1).
- InvalidatePrivate: true,
- })
- d.mapsMu.Unlock()
- }
- // We are now guaranteed that there are no translations of
- // truncated pages, and can remove them from the cache. Since
- // truncated pages have been removed from the remote file, they
- // should be dropped without being written back.
- d.dataMu.Lock()
- d.cache.Truncate(d.size, d.fs.mfp.MemoryFile())
- d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend})
- d.dataMu.Unlock()
- }
}
- return nil
}
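// Editorial worked example for updateFileSizeLocked (values hypothetical):
// with 4 KiB pages, truncating from oldSize=10000 to newSize=4096 gives
//
//	oldpgend, _ := usermem.PageRoundUp(uint64(10000)) // 12288
//	newpgend, _ := usermem.PageRoundUp(uint64(4096))  // 4096
//
// so private mappings of [4096, 12288) are invalidated, the page cache is
// truncated to 4096 bytes, and pages in the dropped range are marked clean
// rather than written back.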
func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
}
+func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error {
+ return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&child.uid)))
+}
+
func dentryUIDFromP9UID(uid p9.UID) uint32 {
if !uid.Ok() {
return uint32(auth.OverflowUID)
@@ -992,10 +1099,10 @@ func (d *dentry) TryIncRef() bool {
}
// DecRef implements vfs.DentryImpl.DecRef.
-func (d *dentry) DecRef() {
+func (d *dentry) DecRef(ctx context.Context) {
if refs := atomic.AddInt64(&d.refs, -1); refs == 0 {
d.fs.renameMu.Lock()
- d.checkCachingLocked()
+ d.checkCachingLocked(ctx)
d.fs.renameMu.Unlock()
} else if refs < 0 {
panic("gofer.dentry.DecRef() called without holding a reference")
@@ -1011,6 +1118,37 @@ func (d *dentry) decRefLocked() {
}
}
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
+ if d.isDir() {
+ events |= linux.IN_ISDIR
+ }
+
+ d.fs.renameMu.RLock()
+ // The ordering below is important: Linux always notifies the parent first.
+ if d.parent != nil {
+ d.parent.watches.Notify(ctx, d.name, events, cookie, et, d.isDeleted())
+ }
+ d.watches.Notify(ctx, "", events, cookie, et, d.isDeleted())
+ d.fs.renameMu.RUnlock()
+}
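// A short caller sketch (the event values mirror SetStat below): a
// successful chmod generates IN_ATTRIB, and the parent's watches are
// notified before the dentry's own, as in Linux:
//
//	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent)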
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *dentry) Watches() *vfs.Watches {
+ return &d.watches
+}
+
+// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
+//
+// If no watches are left on this dentry and it has no references, cache it.
+func (d *dentry) OnZeroWatches(ctx context.Context) {
+ if atomic.LoadInt64(&d.refs) == 0 {
+ d.fs.renameMu.Lock()
+ d.checkCachingLocked(ctx)
+ d.fs.renameMu.Unlock()
+ }
+}
+
// checkCachingLocked should be called after d's reference count becomes 0 or it
// becomes disowned.
//
@@ -1020,8 +1158,9 @@ func (d *dentry) decRefLocked() {
// operation. One of the calls may destroy the dentry, so subsequent calls will
// do nothing.
//
-// Preconditions: d.fs.renameMu must be locked for writing.
-func (d *dentry) checkCachingLocked() {
+// Preconditions: d.fs.renameMu must be locked for writing; it may be
+// temporarily unlocked.
+func (d *dentry) checkCachingLocked(ctx context.Context) {
// Dentries with a non-zero reference count must be retained. (The only way
// to obtain a reference on a dentry with zero references is via path
// resolution, which requires renameMu, so if d.refs is zero then it will
@@ -1042,12 +1181,23 @@ func (d *dentry) checkCachingLocked() {
// Deleted and invalidated dentries with zero references are no longer
// reachable by path resolution and should be dropped immediately.
if d.vfsd.IsDead() {
+ if d.isDeleted() {
+ d.watches.HandleDeletion(ctx)
+ }
if d.cached {
d.fs.cachedDentries.Remove(d)
d.fs.cachedDentriesLen--
d.cached = false
}
- d.destroyLocked()
+ d.destroyLocked(ctx)
+ return
+ }
+ // If d still has inotify watches and it is not deleted or invalidated, we
+ // cannot cache it and allow it to be evicted. Otherwise, we will lose its
+ // watches, even if a new dentry is created for the same file in the future.
+ // Note that the size of d.watches cannot concurrently transition from zero
+ // to non-zero, because adding a watch requires holding a reference on d.
+ if d.watches.Size() > 0 {
return
}
// If d is already cached, just move it to the front of the LRU.
@@ -1074,7 +1224,7 @@ func (d *dentry) checkCachingLocked() {
if !victim.vfsd.IsDead() {
// Note that victim can't be a mount point (in any mount
// namespace), since VFS holds references on mount points.
- d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(&victim.vfsd)
+ d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &victim.vfsd)
delete(victim.parent.children, victim.name)
// We're only deleting the dentry, not the file it
// represents, so we don't need to update
@@ -1082,19 +1232,21 @@ func (d *dentry) checkCachingLocked() {
}
victim.parent.dirMu.Unlock()
}
- victim.destroyLocked()
+ victim.destroyLocked(ctx)
}
// Whether or not victim was destroyed, we brought fs.cachedDentriesLen
// back down to fs.opts.maxCachedDentries, so we don't loop.
}
}
-// destroyLocked destroys the dentry. It may flushes dirty pages from cache,
-// close p9 file and remove reference on parent dentry.
+// destroyLocked destroys the dentry.
//
-// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. d is
-// not a child dentry.
-func (d *dentry) destroyLocked() {
+// Preconditions:
+// * d.fs.renameMu must be locked for writing; it may be temporarily unlocked.
+// * d.refs == 0.
+// * d.parent.children[d.name] != d, i.e. d is not reachable by path traversal
+// from its former parent dentry.
+func (d *dentry) destroyLocked(ctx context.Context) {
switch atomic.LoadInt64(&d.refs) {
case 0:
// Mark the dentry destroyed.
@@ -1105,27 +1257,64 @@ func (d *dentry) destroyLocked() {
panic("dentry.destroyLocked() called with references on the dentry")
}
- ctx := context.Background()
+ // Allow the following to proceed without renameMu locked to improve
+ // scalability.
+ d.fs.renameMu.Unlock()
+
+ mf := d.fs.mfp.MemoryFile()
d.handleMu.Lock()
- if !d.handle.file.isNil() {
- mf := d.fs.mfp.MemoryFile()
- d.dataMu.Lock()
+ d.dataMu.Lock()
+ if h := d.writeHandleLocked(); h.isOpen() {
// Write dirty pages back to the remote filesystem.
- if d.handleWritable {
- if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil {
- log.Warningf("gofer.dentry.DecRef: failed to write dirty data back: %v", err)
- }
+ if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
+ log.Warningf("gofer.dentry.destroyLocked: failed to write dirty data back: %v", err)
}
- // Discard cached data.
+ }
+ // Discard cached data.
+ if !d.cache.IsEmpty() {
+ mf.MarkAllUnevictable(d)
d.cache.DropAll(mf)
d.dirty.RemoveAll()
- d.dataMu.Unlock()
- // Clunk open fids and close open host FDs.
- d.handle.close(ctx)
+ }
+ d.dataMu.Unlock()
+ // Clunk open fids and close open host FDs.
+ if !d.readFile.isNil() {
+ d.readFile.close(ctx)
+ }
+ if !d.writeFile.isNil() && d.readFile != d.writeFile {
+ d.writeFile.close(ctx)
+ }
+ d.readFile = p9file{}
+ d.writeFile = p9file{}
+ if d.hostFD >= 0 {
+ syscall.Close(int(d.hostFD))
+ d.hostFD = -1
}
d.handleMu.Unlock()
if !d.file.isNil() {
+ if !d.isDeleted() {
+ // Write dirty timestamps back to the remote filesystem.
+ atimeDirty := atomic.LoadUint32(&d.atimeDirty) != 0
+ mtimeDirty := atomic.LoadUint32(&d.mtimeDirty) != 0
+ if atimeDirty || mtimeDirty {
+ atime := atomic.LoadInt64(&d.atime)
+ mtime := atomic.LoadInt64(&d.mtime)
+ if err := d.file.setAttr(ctx, p9.SetAttrMask{
+ ATime: atimeDirty,
+ ATimeNotSystemTime: atimeDirty,
+ MTime: mtimeDirty,
+ MTimeNotSystemTime: mtimeDirty,
+ }, p9.SetAttr{
+ ATimeSeconds: uint64(atime / 1e9),
+ ATimeNanoSeconds: uint64(atime % 1e9),
+ MTimeSeconds: uint64(mtime / 1e9),
+ MTimeNanoSeconds: uint64(mtime % 1e9),
+ }); err != nil {
+ log.Warningf("gofer.dentry.destroyLocked: failed to write dirty timestamps back: %v", err)
+ }
+ }
+ }
d.file.close(ctx)
d.file = p9file{}
// Remove d from the set of syncable dentries.
@@ -1133,11 +1322,14 @@ func (d *dentry) destroyLocked() {
delete(d.fs.syncableDentries, d)
d.fs.syncMu.Unlock()
}
+
+ d.fs.renameMu.Lock()
+
// Drop the reference held by d on its parent without recursively locking
// d.fs.renameMu.
if d.parent != nil {
if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 {
- d.parent.checkCachingLocked()
+ d.parent.checkCachingLocked(ctx)
} else if refs < 0 {
panic("gofer.dentry.DecRef() called without holding a reference")
}
@@ -1155,7 +1347,7 @@ func (d *dentry) setDeleted() {
// We only support xattrs prefixed with "user." (see b/148380782). Currently,
// there is no need to expose any other xattrs through a gofer.
func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
- if d.file.isNil() {
+ if d.file.isNil() || !d.userXattrSupported() {
return nil, nil
}
xattrMap, err := d.file.listXattr(ctx, size)
@@ -1181,6 +1373,9 @@ func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vf
if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
return "", syserror.EOPNOTSUPP
}
+ if !d.userXattrSupported() {
+ return "", syserror.ENODATA
+ }
return d.file.getXattr(ctx, opts.Name, opts.Size)
}
@@ -1194,6 +1389,9 @@ func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vf
if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
return syserror.EOPNOTSUPP
}
+ if !d.userXattrSupported() {
+ return syserror.EPERM
+ }
return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags)
}
@@ -1207,89 +1405,139 @@ func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name
if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
return syserror.EOPNOTSUPP
}
+ if !d.userXattrSupported() {
+ return syserror.EPERM
+ }
return d.file.removeXattr(ctx, name)
}
-// Preconditions: !d.isSynthetic(). d.isRegularFile() || d.isDirectory().
+// Extended attributes in the user.* namespace are only supported for regular
+// files and directories.
+func (d *dentry) userXattrSupported() bool {
+ filetype := linux.FileMode(atomic.LoadUint32(&d.mode)).FileType()
+ return filetype == linux.ModeRegular || filetype == linux.ModeDirectory
+}
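// A hedged usage sketch (names and values hypothetical): the checks above
// and in the xattr methods yield Linux-like behavior per file type:
//
//	// Regular file or directory: forwarded to the gofer.
//	err := d.setxattr(ctx, creds, &vfs.SetxattrOptions{Name: "user.foo", Value: "bar"})
//	// Symlink, device, or pipe: EPERM from setxattr/removexattr;
//	// getxattr returns ENODATA and listxattr returns nothing.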
+
+// Preconditions: !d.isSynthetic(). d.isRegularFile() || d.isDir().
func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error {
// O_TRUNC unconditionally requires us to obtain a new handle (opened with
// O_TRUNC).
if !trunc {
d.handleMu.RLock()
- if (!read || d.handleReadable) && (!write || d.handleWritable) {
- // The current handle is sufficient.
+ if (!read || !d.readFile.isNil()) && (!write || !d.writeFile.isNil()) {
+ // Current handles are sufficient.
d.handleMu.RUnlock()
return nil
}
d.handleMu.RUnlock()
}
- haveOldFD := false
+ fdToClose := int32(-1)
+ invalidateTranslations := false
d.handleMu.Lock()
- if (read && !d.handleReadable) || (write && !d.handleWritable) || trunc {
- // Get a new handle.
- wantReadable := d.handleReadable || read
- wantWritable := d.handleWritable || write
- h, err := openHandle(ctx, d.file, wantReadable, wantWritable, trunc)
+ if (read && d.readFile.isNil()) || (write && d.writeFile.isNil()) || trunc {
+ // Get a new handle. If this file has been opened for both reading and
+ // writing, try to get a single handle that is usable for both:
+ //
+ // - Writable memory mappings of a host FD require that the host FD is
+ // opened for both reading and writing.
+ //
+ // - NOTE(b/141991141): Some filesystems may not ensure coherence
+ // between multiple handles for the same file.
+ openReadable := !d.readFile.isNil() || read
+ openWritable := !d.writeFile.isNil() || write
+ h, err := openHandle(ctx, d.file, openReadable, openWritable, trunc)
+ if err == syserror.EACCES && (openReadable != read || openWritable != write) {
+ // It may not be possible to use a single handle for both
+ // reading and writing, since permissions on the file may have
+ // changed to e.g. disallow reading after previously being
+ // opened for reading. In this case, we have no choice but to
+ // use separate handles for reading and writing.
+ ctx.Debugf("gofer.dentry.ensureSharedHandle: bifurcating read/write handles for dentry %p", d)
+ openReadable = read
+ openWritable = write
+ h, err = openHandle(ctx, d.file, openReadable, openWritable, trunc)
+ }
if err != nil {
d.handleMu.Unlock()
return err
}
- if !d.handle.file.isNil() {
- // Check that old and new handles are compatible: If the old handle
- // includes a host file descriptor but the new one does not, or
- // vice versa, old and new memory mappings may be incoherent.
- haveOldFD = d.handle.fd >= 0
- haveNewFD := h.fd >= 0
- if haveOldFD != haveNewFD {
- d.handleMu.Unlock()
- ctx.Warningf("gofer.dentry.ensureSharedHandle: can't change host FD availability from %v to %v across dentry handle upgrade", haveOldFD, haveNewFD)
- h.close(ctx)
- return syserror.EIO
- }
- if haveOldFD {
- // We may have raced with callers of d.pf.FD() that are now
- // using the old file descriptor, preventing us from safely
- // closing it. We could handle this by invalidating existing
- // memmap.Translations, but this is expensive. Instead, use
- // dup3 to make the old file descriptor refer to the new file
- // description, then close the new file descriptor (which is no
- // longer needed). Racing callers may use the old or new file
- // description, but this doesn't matter since they refer to the
- // same file (unless d.fs.opts.overlayfsStaleRead is true,
- // which we handle separately).
- if err := syscall.Dup3(int(h.fd), int(d.handle.fd), syscall.O_CLOEXEC); err != nil {
+
+ if d.hostFD < 0 && openReadable && h.fd >= 0 {
+ // We have no existing FD; use the new FD for at least reading.
+ d.hostFD = h.fd
+ } else if d.hostFD >= 0 && d.writeFile.isNil() && openWritable {
+ // We have an existing read-only FD, but the file has just been
+ // opened for writing, so we need to start supporting writable memory
+ // mappings. This may race with callers of d.pf.FD() using the existing
+ // FD, so in most cases we need to delay closing the old FD until after
+ // invalidating memmap.Translations that might have observed it.
+ if !openReadable || h.fd < 0 {
+ // We don't have a read/write FD, so we have no FD that can be
+ // used to create writable memory mappings. Switch to using the
+ // internal page cache.
+ invalidateTranslations = true
+ fdToClose = d.hostFD
+ d.hostFD = -1
+ } else if d.fs.opts.overlayfsStaleRead {
+ // We do have a read/write FD, but it may not be coherent with
+ // the existing read-only FD, so we must switch to mappings of
+ // the new FD in both the application and sentry.
+ if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil {
d.handleMu.Unlock()
- ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err)
+ ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err)
h.close(ctx)
return err
}
- syscall.Close(int(h.fd))
- h.fd = d.handle.fd
- if d.fs.opts.overlayfsStaleRead {
- // Replace sentry mappings of the old FD with mappings of
- // the new FD, since the two are not necessarily coherent.
- if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil {
- d.handleMu.Unlock()
- ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err)
- h.close(ctx)
- return err
- }
+ invalidateTranslations = true
+ fdToClose = d.hostFD
+ d.hostFD = h.fd
+ } else {
+ // We do have a read/write FD. To avoid invalidating existing
+ // memmap.Translations (which is expensive), use dup3 to make
+ // the old file descriptor refer to the new file description,
+ // then close the new file descriptor (which is no longer
+ // needed). Racing callers of d.pf.FD() may use the old or new
+ // file description, but this doesn't matter since they refer
+ // to the same file, and any racing mappings must be read-only.
+ if err := syscall.Dup3(int(h.fd), int(d.hostFD), syscall.O_CLOEXEC); err != nil {
+ oldHostFD := d.hostFD
+ d.handleMu.Unlock()
+ ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, oldHostFD, err)
+ h.close(ctx)
+ return err
}
- // Clunk the old fid before making the new handle visible (by
- // unlocking d.handleMu).
- d.handle.file.close(ctx)
+ fdToClose = h.fd
}
+ } else {
+ // h.fd is not useful.
+ fdToClose = h.fd
+ }
+
+ // Switch to new fids.
+ var oldReadFile p9file
+ if openReadable {
+ oldReadFile = d.readFile
+ d.readFile = h.file
+ }
+ var oldWriteFile p9file
+ if openWritable {
+ oldWriteFile = d.writeFile
+ d.writeFile = h.file
+ }
+ // NOTE(b/141991141): Clunk old fids before making new fids visible (by
+ // unlocking d.handleMu).
+ if !oldReadFile.isNil() {
+ oldReadFile.close(ctx)
+ }
+ if !oldWriteFile.isNil() && oldReadFile != oldWriteFile {
+ oldWriteFile.close(ctx)
}
- // Switch to the new handle.
- d.handle = h
- d.handleReadable = wantReadable
- d.handleWritable = wantWritable
}
d.handleMu.Unlock()
- if d.fs.opts.overlayfsStaleRead && haveOldFD {
- // Invalidate application mappings that may be using the old FD; they
+ if invalidateTranslations {
+ // Invalidate application mappings that may be using an old FD; they
// will be replaced with mappings using the new FD after future calls
// to d.Translate(). This requires holding d.mapsMu, which precedes
// d.handleMu in the lock order.
@@ -1297,28 +1545,70 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
d.mappings.InvalidateAll(memmap.InvalidateOpts{})
d.mapsMu.Unlock()
}
+ if fdToClose >= 0 {
+ syscall.Close(int(fdToClose))
+ }
return nil
}
+// Preconditions: d.handleMu must be locked.
+func (d *dentry) readHandleLocked() handle {
+ return handle{
+ file: d.readFile,
+ fd: d.hostFD,
+ }
+}
+
+// Preconditions: d.handleMu must be locked.
+func (d *dentry) writeHandleLocked() handle {
+ return handle{
+ file: d.writeFile,
+ fd: d.hostFD,
+ }
+}
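// A minimal usage sketch (mirroring dentryReadWriter in regular_file.go):
// I/O paths take handleMu for reading, pick the composite handle, and the
// handle itself prefers the host FD over the 9P fid when one is available:
//
//	d.handleMu.RLock()
//	h := d.readHandleLocked()
//	n, err := h.readToBlocksAt(ctx, dsts, off)
//	d.handleMu.RUnlock()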
+
+func (d *dentry) syncRemoteFile(ctx context.Context) error {
+ d.handleMu.RLock()
+ defer d.handleMu.RUnlock()
+ return d.syncRemoteFileLocked(ctx)
+}
+
+// Preconditions: d.handleMu must be locked.
+func (d *dentry) syncRemoteFileLocked(ctx context.Context) error {
+ // If we have a host FD, fsyncing it is likely to be faster than an fsync
+ // RPC.
+ if d.hostFD >= 0 {
+ ctx.UninterruptibleSleepStart(false)
+ err := syscall.Fsync(int(d.hostFD))
+ ctx.UninterruptibleSleepFinish(false)
+ return err
+ }
+ if !d.writeFile.isNil() {
+ return d.writeFile.fsync(ctx)
+ }
+ if !d.readFile.isNil() {
+ return d.readFile.fsync(ctx)
+ }
+ return nil
+}
+
// incLinks increments link count.
-//
-// Preconditions: d.nlink != 0 && d.nlink < math.MaxUint32.
func (d *dentry) incLinks() {
- v := atomic.AddUint32(&d.nlink, 1)
- if v < 2 {
- panic(fmt.Sprintf("dentry.nlink is invalid (was 0 or overflowed): %d", v))
+ if atomic.LoadUint32(&d.nlink) == 0 {
+ // The remote filesystem doesn't support link count.
+ return
}
+ atomic.AddUint32(&d.nlink, 1)
}
// decLinks decrements link count.
-//
-// Preconditions: d.nlink > 1.
func (d *dentry) decLinks() {
- v := atomic.AddUint32(&d.nlink, ^uint32(0))
- if v == 0 {
- panic(fmt.Sprintf("dentry.nlink must be greater than 0: %d", v))
+ if atomic.LoadUint32(&d.nlink) == 0 {
+ // The remote filesystem doesn't support link count.
+ return
}
+ atomic.AddUint32(&d.nlink, ^uint32(0))
}
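// Editorial sketch: nlink == 0 doubles as a sentinel for "link count not
// reported by the remote filesystem", so link/unlink leave it pinned there
// and statTo substitutes 1:
//
//	d.incLinks() // no-op while atomic.LoadUint32(&d.nlink) == 0
//	d.decLinks() // likewise; statTo reports such files with Nlink == 1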
// fileDescription is embedded by gofer implementations of
@@ -1326,6 +1616,9 @@ func (d *dentry) decLinks() {
type fileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
+ vfs.LockFD
+
+ // lockLogging logs, at most once, that file locks are handled internally
+ // by the sentry (see LockBSD/LockPOSIX below).
+ lockLogging sync.Once
}
func (fd *fileDescription) filesystem() *filesystem {
@@ -1354,7 +1647,13 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu
// SetStat implements vfs.FileDescriptionImpl.SetStat.
func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
- return fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts.Stat, fd.vfsfd.Mount())
+ if err := fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts, fd.vfsfd.Mount()); err != nil {
+ return err
+ }
+ if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+ fd.dentry().InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
+ }
+ return nil
}
// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
@@ -1369,10 +1668,41 @@ func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOption
// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
- return fd.dentry().setxattr(ctx, auth.CredentialsFromContext(ctx), &opts)
+ d := fd.dentry()
+ if err := d.setxattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil {
+ return err
+ }
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ return nil
}
// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
- return fd.dentry().removexattr(ctx, auth.CredentialsFromContext(ctx), name)
+ d := fd.dentry()
+ if err := d.removexattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil {
+ return err
+ }
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ return nil
+}
+
+// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
+func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
+ fd.lockLogging.Do(func() {
+ log.Infof("File lock using gofer file handled internally.")
+ })
+ return fd.LockFD.LockBSD(ctx, uid, t, block)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ fd.lockLogging.Do(func() {
+ log.Infof("Range lock using gofer file handled internally.")
+ })
+ return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
}
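// A hedged illustration (lock types from pkg/sentry/fs/lock): an
// application's flock(fd, LOCK_EX) arrives here as a write lock and is
// serviced entirely by the sentry's internal lock table, never the gofer,
// hence the one-time informational log above:
//
//	err := fd.LockBSD(ctx, uid, fslock.WriteLock, nil /* don't block */)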
diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go
index adff39490..bfe75dfe4 100644
--- a/pkg/sentry/fsimpl/gofer/gofer_test.go
+++ b/pkg/sentry/fsimpl/gofer/gofer_test.go
@@ -20,10 +20,13 @@ import (
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/sentry/contexttest"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
)
func TestDestroyIdempotent(t *testing.T) {
+ ctx := contexttest.Context(t)
fs := filesystem{
+ mfp: pgalloc.MemoryFileProviderFromContext(ctx),
syncableDentries: make(map[*dentry]struct{}),
opts: filesystemOptions{
// Test relies on no dentry being held in the cache.
@@ -31,7 +34,6 @@ func TestDestroyIdempotent(t *testing.T) {
},
}
- ctx := contexttest.Context(t)
attr := &p9.Attr{
Mode: p9.ModeRegular,
}
@@ -50,7 +52,9 @@ func TestDestroyIdempotent(t *testing.T) {
}
parent.cacheNewChildLocked(child, "child")
- child.checkCachingLocked()
+ fs.renameMu.Lock()
+ defer fs.renameMu.Unlock()
+ child.checkCachingLocked(ctx)
if got := atomic.LoadInt64(&child.refs); got != -1 {
t.Fatalf("child.refs=%d, want: -1", got)
}
@@ -58,6 +62,6 @@ func TestDestroyIdempotent(t *testing.T) {
if got := atomic.LoadInt64(&parent.refs); got != -1 {
t.Fatalf("parent.refs=%d, want: -1", got)
}
- child.checkCachingLocked()
- child.checkCachingLocked()
+ child.checkCachingLocked(ctx)
+ child.checkCachingLocked(ctx)
}
diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go
index 724a3f1f7..104157512 100644
--- a/pkg/sentry/fsimpl/gofer/handle.go
+++ b/pkg/sentry/fsimpl/gofer/handle.go
@@ -63,6 +63,10 @@ func openHandle(ctx context.Context, file p9file, read, write, trunc bool) (hand
}, nil
}
+func (h *handle) isOpen() bool {
+ return !h.file.isNil()
+}
+
func (h *handle) close(ctx context.Context) {
h.file.close(ctx)
h.file = p9file{}
@@ -124,13 +128,3 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o
}
return cp, cperr
}
-
-func (h *handle) sync(ctx context.Context) error {
- if h.fd >= 0 {
- ctx.UninterruptibleSleepStart(false)
- err := syscall.Fsync(int(h.fd))
- ctx.UninterruptibleSleepFinish(false)
- return err
- }
- return h.file.fsync(ctx)
-}
diff --git a/pkg/sentry/fsimpl/gofer/host_named_pipe.go b/pkg/sentry/fsimpl/gofer/host_named_pipe.go
new file mode 100644
index 000000000..7294de7d6
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/host_named_pipe.go
@@ -0,0 +1,97 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+ "fmt"
+ "sync"
+ "time"
+
+ "golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Global pipe used by blockUntilNonblockingPipeHasWriter since we can't create
+// pipes after sentry initialization due to syscall filters.
+var (
+ tempPipeMu sync.Mutex
+ tempPipeReadFD int
+ tempPipeWriteFD int
+ tempPipeBuf [1]byte
+)
+
+func init() {
+ var pipeFDs [2]int
+ if err := unix.Pipe(pipeFDs[:]); err != nil {
+ panic(fmt.Sprintf("failed to create pipe for gofer.blockUntilNonblockingPipeHasWriter: %v", err))
+ }
+ tempPipeReadFD = pipeFDs[0]
+ tempPipeWriteFD = pipeFDs[1]
+}
+
+func blockUntilNonblockingPipeHasWriter(ctx context.Context, fd int32) error {
+ for {
+ ok, err := nonblockingPipeHasWriter(fd)
+ if err != nil {
+ return err
+ }
+ if ok {
+ return nil
+ }
+ if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil {
+ return err
+ }
+ }
+}
+
+func nonblockingPipeHasWriter(fd int32) (bool, error) {
+ tempPipeMu.Lock()
+ defer tempPipeMu.Unlock()
+ // Copy 1 byte from fd into the temporary pipe.
+ n, err := unix.Tee(int(fd), tempPipeWriteFD, 1, unix.SPLICE_F_NONBLOCK)
+ if err == syserror.EAGAIN {
+ // The pipe represented by fd is empty, but has a writer.
+ return true, nil
+ }
+ if err != nil {
+ return false, err
+ }
+ if n == 0 {
+ // The pipe represented by fd is empty and has no writer.
+ return false, nil
+ }
+ // The pipe represented by fd is non-empty, so it either has, or has
+ // previously had, a writer. Remove the byte copied to the temporary pipe
+ // before returning.
+ if n, err := unix.Read(tempPipeReadFD, tempPipeBuf[:]); err != nil || n != 1 {
+ panic(fmt.Sprintf("failed to drain pipe for gofer.blockUntilNonblockingPipeHasWriter: got (%d, %v), wanted (1, nil)", n, err))
+ }
+ return true, nil
+}
+
+func sleepBetweenNamedPipeOpenChecks(ctx context.Context) error {
+ t := time.NewTimer(100 * time.Millisecond)
+ defer t.Stop()
+ cancel := ctx.SleepStart()
+ select {
+ case <-t.C:
+ ctx.SleepFinish(true)
+ return nil
+ case <-cancel:
+ ctx.SleepFinish(false)
+ return syserror.ErrInterrupted
+ }
+}
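// A hedged usage sketch (caller hypothetical): named pipes are opened with
// O_NONBLOCK so open(2) cannot hang in the gofer, then the helpers above
// poll every 100ms for a writer, honoring interruption via the context:
//
//	fd, err := unix.Open(path, unix.O_RDONLY|unix.O_NONBLOCK, 0)
//	if err != nil {
//		return err
//	}
//	if err := blockUntilNonblockingPipeHasWriter(ctx, int32(fd)); err != nil {
//		unix.Close(fd)
//		return err // e.g. syserror.ErrInterrupted
//	}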
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index 0d10cf7ac..7e1cbf065 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -24,11 +24,11 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
@@ -48,7 +48,7 @@ type regularFileFD struct {
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *regularFileFD) Release() {
+func (fd *regularFileFD) Release(context.Context) {
}
// OnClose implements vfs.FileDescriptionImpl.OnClose.
@@ -64,7 +64,37 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error {
}
d.handleMu.RLock()
defer d.handleMu.RUnlock()
- return d.handle.file.flush(ctx)
+ if d.writeFile.isNil() {
+ return nil
+ }
+ return d.writeFile.flush(ctx)
+}
+
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ d := fd.dentry()
+ d.metadataMu.Lock()
+ defer d.metadataMu.Unlock()
+
+ // Allocating a smaller size is a no-op.
+ size := offset + length
+ if d.cachedMetadataAuthoritative() && size <= d.size {
+ return nil
+ }
+
+ d.handleMu.RLock()
+ err := d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
+ d.handleMu.RUnlock()
+ if err != nil {
+ return err
+ }
+ d.dataMu.Lock()
+ atomic.StoreUint64(&d.size, size)
+ d.dataMu.Unlock()
+ if d.cachedMetadataAuthoritative() {
+ d.touchCMtimeLocked()
+ }
+ return nil
}
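// Editorial worked example: Allocate(ctx, 0 /* mode */, 8192, 4096) asks
// the gofer to reserve [8192, 12288); on success d.size becomes 12288 (the
// early return above already handled size <= d.size when the metadata
// cache is authoritative), and ctime/mtime are then updated client-side.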
// PRead implements vfs.FileDescriptionImpl.PRead.
@@ -72,14 +102,18 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
if offset < 0 {
return 0, syserror.EINVAL
}
- if opts.Flags != 0 {
+
+ // Check that flags are supported.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+ if opts.Flags&^linux.RWF_HIPRI != 0 {
return 0, syserror.EOPNOTSUPP
}
// Check for reading at EOF before calling into MM (but not under
// InteropModeShared, which makes d.size unreliable).
d := fd.dentry()
- if d.fs.opts.interop != InteropModeShared && uint64(offset) >= atomic.LoadUint64(&d.size) {
+ if d.cachedMetadataAuthoritative() && uint64(offset) >= atomic.LoadUint64(&d.size) {
return 0, io.EOF
}
@@ -120,85 +154,134 @@ func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts
// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ n, _, err := fd.pwrite(ctx, src, offset, opts)
+ return n, err
+}
+
+// pwrite returns the number of bytes written, the final offset, and an
+// error. The final offset should be ignored by PWrite.
+func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
if offset < 0 {
- return 0, syserror.EINVAL
+ return 0, offset, syserror.EINVAL
}
- if opts.Flags != 0 {
- return 0, syserror.EOPNOTSUPP
+
+ // Check that flags are supported.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
+ if opts.Flags&^linux.RWF_HIPRI != 0 {
+ return 0, offset, syserror.EOPNOTSUPP
+ }
+
+ d := fd.dentry()
+ // If the fd was opened with O_APPEND, make sure the file size is updated.
+ // There is a possible race here if the size is modified externally after
+ // the metadata cache is updated.
+ if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
+ if err := d.updateFromGetattr(ctx); err != nil {
+ return 0, offset, err
+ }
+ }
+
+ d.metadataMu.Lock()
+ defer d.metadataMu.Unlock()
+
+ // Set offset to file size if the fd was opened with O_APPEND.
+ if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+ // Holding d.metadataMu is sufficient for reading d.size.
+ offset = int64(d.size)
}
limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
if err != nil {
- return 0, err
+ return 0, offset, err
}
src = src.TakeFirst64(limit)
- d := fd.dentry()
- d.metadataMu.Lock()
- defer d.metadataMu.Unlock()
if d.fs.opts.interop != InteropModeShared {
// Compare Linux's mm/filemap.c:__generic_file_write_iter() =>
// file_update_time(). This is d.touchCMtime(), but without locking
// d.metadataMu (recursively).
d.touchCMtimeLocked()
}
- if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
- // Write dirty cached pages that will be touched by the write back to
- // the remote file.
- if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
- return 0, err
- }
- // Remove touched pages from the cache.
- pgstart := usermem.PageRoundDown(uint64(offset))
- pgend, ok := usermem.PageRoundUp(uint64(offset + src.NumBytes()))
- if !ok {
- return 0, syserror.EINVAL
- }
- mr := memmap.MappableRange{pgstart, pgend}
- var freed []platform.FileRange
- d.dataMu.Lock()
- cseg := d.cache.LowerBoundSegment(mr.Start)
- for cseg.Ok() && cseg.Start() < mr.End {
- cseg = d.cache.Isolate(cseg, mr)
- freed = append(freed, platform.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()})
- cseg = d.cache.Remove(cseg).NextSegment()
- }
- d.dataMu.Unlock()
- // Invalidate mappings of removed pages.
- d.mapsMu.Lock()
- d.mappings.Invalidate(mr, memmap.InvalidateOpts{})
- d.mapsMu.Unlock()
- // Finally free pages removed from the cache.
- mf := d.fs.mfp.MemoryFile()
- for _, freedFR := range freed {
- mf.DecRef(freedFR)
- }
- }
+
rw := getDentryReadWriter(ctx, d, offset)
+ defer putDentryReadWriter(rw)
+
if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+ if err := fd.writeCache(ctx, d, offset, src); err != nil {
+ return 0, offset, err
+ }
+
// Require the write to go to the remote file.
rw.direct = true
}
+
n, err := src.CopyInTo(ctx, rw)
- putDentryReadWriter(rw)
- if n != 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 {
+ if err != nil {
+ return n, offset + n, err
+ }
+ if n > 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 {
+ // Note that if any of the following fail, then we can't guarantee that
+ // any data was actually written with the semantics of O_DSYNC or
+ // O_SYNC, so we return zero bytes written. Compare Linux's
+ // mm/filemap.c:generic_file_write_iter() =>
+ // include/linux/fs.h:generic_write_sync().
+ //
// Write dirty cached pages touched by the write back to the remote
// file.
if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
- return 0, err
+ return 0, offset, err
}
// Request the remote filesystem to sync the remote file.
- if err := d.handle.file.fsync(ctx); err != nil {
- return 0, err
+ if err := d.syncRemoteFile(ctx); err != nil {
+ return 0, offset, err
}
}
- return n, err
+ return n, offset + n, nil
+}
+
+func (fd *regularFileFD) writeCache(ctx context.Context, d *dentry, offset int64, src usermem.IOSequence) error {
+ // Write dirty cached pages that will be touched by the write back to
+ // the remote file.
+ if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
+ return err
+ }
+
+ // Remove touched pages from the cache.
+ pgstart := usermem.PageRoundDown(uint64(offset))
+ pgend, ok := usermem.PageRoundUp(uint64(offset + src.NumBytes()))
+ if !ok {
+ return syserror.EINVAL
+ }
+ mr := memmap.MappableRange{pgstart, pgend}
+ var freed []memmap.FileRange
+
+ d.dataMu.Lock()
+ cseg := d.cache.LowerBoundSegment(mr.Start)
+ for cseg.Ok() && cseg.Start() < mr.End {
+ cseg = d.cache.Isolate(cseg, mr)
+ freed = append(freed, memmap.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()})
+ cseg = d.cache.Remove(cseg).NextSegment()
+ }
+ d.dataMu.Unlock()
+
+ // Invalidate mappings of removed pages.
+ d.mapsMu.Lock()
+ d.mappings.Invalidate(mr, memmap.InvalidateOpts{})
+ d.mapsMu.Unlock()
+
+ // Finally free pages removed from the cache.
+ mf := d.fs.mfp.MemoryFile()
+ for _, freedFR := range freed {
+ mf.DecRef(freedFR)
+ }
+ return nil
}
// Write implements vfs.FileDescriptionImpl.Write.
func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
fd.mu.Lock()
- n, err := fd.PWrite(ctx, src, fd.off, opts)
- fd.off += n
+ n, off, err := fd.pwrite(ctx, src, fd.off, opts)
+ fd.off = off
fd.mu.Unlock()
return n, err
}
@@ -241,10 +324,11 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error)
// coherence with memory-mapped I/O), or if InteropModeShared is in effect
// (which prevents us from caching file contents and makes dentry.size
// unreliable), or if the file was opened O_DIRECT, read directly from
- // dentry.handle without locking dentry.dataMu.
+ // dentry.readHandleLocked() without locking dentry.dataMu.
rw.d.handleMu.RLock()
- if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
- n, err := rw.d.handle.readToBlocksAt(rw.ctx, dsts, rw.off)
+ h := rw.d.readHandleLocked()
+ if (rw.d.hostFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
+ n, err := h.readToBlocksAt(rw.ctx, dsts, rw.off)
rw.d.handleMu.RUnlock()
rw.off += n
return n, err
@@ -312,7 +396,7 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error)
End: gapEnd,
}
optMR := gap.Range()
- err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, rw.d.handle.readToBlocksAt)
+ err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, h.readToBlocksAt)
mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End})
seg, gap = rw.d.cache.Find(rw.off)
if !seg.Ok() {
@@ -327,7 +411,7 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error)
} else {
// Read directly from the file.
gapDsts := dsts.TakeFirst64(gapMR.Length())
- n, err := rw.d.handle.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start)
+ n, err := h.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start)
done += n
rw.off += n
dsts = dsts.DropFirst64(n)
@@ -359,11 +443,12 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro
// If we have a mmappable host FD (which must be used here to ensure
// coherence with memory-mapped I/O), or if InteropModeShared is in effect
// (which prevents us from caching file contents), or if the file was
- // opened with O_DIRECT, write directly to dentry.handle without locking
- // dentry.dataMu.
+ // opened with O_DIRECT, write directly to dentry.writeHandleLocked()
+ // without locking dentry.dataMu.
rw.d.handleMu.RLock()
- if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
- n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, srcs, rw.off)
+ h := rw.d.writeHandleLocked()
+ if (rw.d.hostFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
+ n, err := h.writeFromBlocksAt(rw.ctx, srcs, rw.off)
rw.off += n
rw.d.dataMu.Lock()
if rw.off > rw.d.size {
@@ -425,7 +510,7 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro
// for detecting or avoiding this.
gapMR := gap.Range().Intersect(mr)
gapSrcs := srcs.TakeFirst64(gapMR.Length())
- n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start)
+ n, err := h.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start)
done += n
rw.off += n
srcs = srcs.DropFirst64(n)
@@ -451,7 +536,7 @@ exitLoop:
if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{
Start: start,
End: rw.off,
- }, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, rw.d.handle.writeFromBlocksAt); err != nil {
+ }, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, h.writeFromBlocksAt); err != nil {
// We have no idea how many bytes were actually flushed.
rw.off = start
done = 0
@@ -469,6 +554,7 @@ func (d *dentry) writeback(ctx context.Context, offset, size int64) error {
}
d.handleMu.RLock()
defer d.handleMu.RUnlock()
+ h := d.writeHandleLocked()
d.dataMu.Lock()
defer d.dataMu.Unlock()
// Compute the range of valid bytes (overflow-checked).
@@ -482,22 +568,31 @@ func (d *dentry) writeback(ctx context.Context, offset, size int64) error {
return fsutil.SyncDirty(ctx, memmap.MappableRange{
Start: uint64(offset),
End: uint64(end),
- }, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt)
+ }, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt)
}
// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
fd.mu.Lock()
defer fd.mu.Unlock()
+ newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence)
+ if err != nil {
+ return 0, err
+ }
+ fd.off = newOffset
+ return newOffset, nil
+}
+
+// regularFileSeekLocked calculates the new offset for a seek operation on a
+// regular file.
+func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int64, whence int32) (int64, error) {
switch whence {
case linux.SEEK_SET:
// Use offset as specified.
case linux.SEEK_CUR:
- offset += fd.off
+ offset += fdOffset
case linux.SEEK_END, linux.SEEK_DATA, linux.SEEK_HOLE:
// Ensure file size is up to date.
- d := fd.dentry()
- if fd.filesystem().opts.interop == InteropModeShared {
+ if !d.cachedMetadataAuthoritative() {
if err := d.updateFromGetattr(ctx); err != nil {
return 0, err
}
@@ -525,31 +620,28 @@ func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (
if offset < 0 {
return 0, syserror.EINVAL
}
- fd.off = offset
return offset, nil
}
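// Editorial worked examples (offsets hypothetical), with fdOffset=100:
//
//	regularFileSeekLocked(ctx, d, 100, 20, linux.SEEK_CUR)   // => 120
//	regularFileSeekLocked(ctx, d, 100, -200, linux.SEEK_CUR) // => EINVAL
//
// SEEK_END, SEEK_DATA, and SEEK_HOLE first refresh d.size via getattr when
// the cached metadata is not authoritative.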
// Sync implements vfs.FileDescriptionImpl.Sync.
func (fd *regularFileFD) Sync(ctx context.Context) error {
- return fd.dentry().syncSharedHandle(ctx)
+ return fd.dentry().syncCachedFile(ctx)
}
-func (d *dentry) syncSharedHandle(ctx context.Context) error {
+func (d *dentry) syncCachedFile(ctx context.Context) error {
d.handleMu.RLock()
- if !d.handleWritable {
- d.handleMu.RUnlock()
- return nil
- }
- d.dataMu.Lock()
- // Write dirty cached data to the remote file.
- err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt)
- d.dataMu.Unlock()
- if err == nil {
- // Sync the remote file.
- err = d.handle.sync(ctx)
+ defer d.handleMu.RUnlock()
+
+ if h := d.writeHandleLocked(); h.isOpen() {
+ d.dataMu.Lock()
+ // Write dirty cached data to the remote file.
+ err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt)
+ d.dataMu.Unlock()
+ if err != nil {
+ return err
+ }
}
- d.handleMu.RUnlock()
- return err
+ return d.syncRemoteFileLocked(ctx)
}
// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
@@ -573,7 +665,7 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt
return syserror.ENODEV
}
d.handleMu.RLock()
- haveFD := d.handle.fd >= 0
+ haveFD := d.hostFD >= 0
d.handleMu.RUnlock()
if !haveFD {
return syserror.ENODEV
@@ -594,7 +686,7 @@ func (d *dentry) mayCachePages() bool {
return true
}
d.handleMu.RLock()
- haveFD := d.handle.fd >= 0
+ haveFD := d.hostFD >= 0
d.handleMu.RUnlock()
return haveFD
}
@@ -652,7 +744,7 @@ func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR,
// Translate implements memmap.Mappable.Translate.
func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
d.handleMu.RLock()
- if d.handle.fd >= 0 && !d.fs.opts.forcePageCache {
+ if d.hostFD >= 0 && !d.fs.opts.forcePageCache {
d.handleMu.RUnlock()
mr := optional
if d.fs.opts.limitHostFDTranslation {
@@ -688,7 +780,8 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab
}
mf := d.fs.mfp.MemoryFile()
- cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, d.handle.readToBlocksAt)
+ h := d.readHandleLocked()
+ cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, h.readToBlocksAt)
var ts []memmap.Translation
var translatedEnd uint64
@@ -747,7 +840,7 @@ func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange
// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
- // Whether we have a host fd (and consequently what platform.File is
+ // Whether we have a host fd (and consequently what memmap.File is
// mapped) can change across save/restore, so invalidate all translations
// unconditionally.
d.mapsMu.Lock()
@@ -757,9 +850,12 @@ func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
// Write the cache's contents back to the remote file so that if we have a
// host fd after restore, the remote file's contents are coherent.
mf := d.fs.mfp.MemoryFile()
+ d.handleMu.RLock()
+ defer d.handleMu.RUnlock()
+ h := d.writeHandleLocked()
d.dataMu.Lock()
defer d.dataMu.Unlock()
- if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil {
+ if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
return err
}
@@ -774,20 +870,23 @@ func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
// Evict implements pgalloc.EvictableMemoryUser.Evict.
func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
+ mr := memmap.MappableRange{er.Start, er.End}
+ mf := d.fs.mfp.MemoryFile()
d.mapsMu.Lock()
defer d.mapsMu.Unlock()
+ d.handleMu.RLock()
+ defer d.handleMu.RUnlock()
+ h := d.writeHandleLocked()
d.dataMu.Lock()
defer d.dataMu.Unlock()
- mr := memmap.MappableRange{er.Start, er.End}
- mf := d.fs.mfp.MemoryFile()
// Only allow pages that are no longer memory-mapped to be evicted.
for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() {
mgapMR := mgap.Range().Intersect(mr)
if mgapMR.Length() == 0 {
continue
}
- if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil {
+ if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err)
}
d.cache.Drop(mgapMR, mf)
@@ -795,53 +894,51 @@ func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
}
}
-// dentryPlatformFile implements platform.File. It exists solely because dentry
-// cannot implement both vfs.DentryImpl.IncRef and platform.File.IncRef.
+// dentryPlatformFile implements memmap.File. It exists solely because dentry
+// cannot implement both vfs.DentryImpl.IncRef and memmap.File.IncRef.
//
// dentryPlatformFile is only used when a host FD representing the remote file
-// is available (i.e. dentry.handle.fd >= 0), and that FD is used for
-// application memory mappings (i.e. !filesystem.opts.forcePageCache).
+// is available (i.e. dentry.hostFD >= 0), and that FD is used for application
+// memory mappings (i.e. !filesystem.opts.forcePageCache).
type dentryPlatformFile struct {
*dentry
- // fdRefs counts references on platform.File offsets. fdRefs is protected
+ // fdRefs counts references on memmap.File offsets. fdRefs is protected
// by dentry.dataMu.
fdRefs fsutil.FrameRefSet
- // If this dentry represents a regular file, and handle.fd >= 0,
- // hostFileMapper caches mappings of handle.fd.
+ // If this dentry represents a regular file, and dentry.hostFD >= 0,
+ // hostFileMapper caches mappings of dentry.hostFD.
hostFileMapper fsutil.HostFileMapper
// hostFileMapperInitOnce is used to lazily initialize hostFileMapper.
hostFileMapperInitOnce sync.Once
}
-// IncRef implements platform.File.IncRef.
-func (d *dentryPlatformFile) IncRef(fr platform.FileRange) {
+// IncRef implements memmap.File.IncRef.
+func (d *dentryPlatformFile) IncRef(fr memmap.FileRange) {
d.dataMu.Lock()
d.fdRefs.IncRefAndAccount(fr)
d.dataMu.Unlock()
}
-// DecRef implements platform.File.DecRef.
-func (d *dentryPlatformFile) DecRef(fr platform.FileRange) {
+// DecRef implements memmap.File.DecRef.
+func (d *dentryPlatformFile) DecRef(fr memmap.FileRange) {
d.dataMu.Lock()
d.fdRefs.DecRefAndAccount(fr)
d.dataMu.Unlock()
}
-// MapInternal implements platform.File.MapInternal.
-func (d *dentryPlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+// MapInternal implements memmap.File.MapInternal.
+func (d *dentryPlatformFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
d.handleMu.RLock()
- bs, err := d.hostFileMapper.MapInternal(fr, int(d.handle.fd), at.Write)
- d.handleMu.RUnlock()
- return bs, err
+ defer d.handleMu.RUnlock()
+ return d.hostFileMapper.MapInternal(fr, int(d.hostFD), at.Write)
}
-// FD implements platform.File.FD.
+// FD implements memmap.File.FD.
func (d *dentryPlatformFile) FD() int {
d.handleMu.RLock()
- fd := d.handle.fd
- d.handleMu.RUnlock()
- return int(fd)
+ defer d.handleMu.RUnlock()
+ return int(d.hostFD)
}
diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go
index d6dbe9092..85d2bee72 100644
--- a/pkg/sentry/fsimpl/gofer/socket.go
+++ b/pkg/sentry/fsimpl/gofer/socket.go
@@ -108,7 +108,7 @@ func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.Connect
// We don't need the receiver.
c.CloseRecv()
- c.Release()
+ c.Release(ctx)
return c, nil
}
@@ -136,8 +136,8 @@ func (e *endpoint) newConnectedEndpoint(ctx context.Context, flags p9.ConnectFla
}
// Release implements transport.BoundEndpoint.Release.
-func (e *endpoint) Release() {
- e.dentry.DecRef()
+func (e *endpoint) Release(ctx context.Context) {
+ e.dentry.DecRef(ctx)
}
// Passcred implements transport.BoundEndpoint.Passcred.
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
index a464e6a94..a6368fdd0 100644
--- a/pkg/sentry/fsimpl/gofer/special_file.go
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -16,20 +16,23 @@ package gofer
import (
"sync"
+ "sync/atomic"
+ "syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fdnotifier"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
)
-// specialFileFD implements vfs.FileDescriptionImpl for files other than
-// regular files, directories, and symlinks: pipes, sockets, etc. It is also
-// used for regular files when filesystemOptions.specialRegularFiles is in
-// effect. specialFileFD differs from regularFileFD by using per-FD handles
-// instead of shared per-dentry handles, and never buffering I/O.
+// specialFileFD implements vfs.FileDescriptionImpl for pipes, sockets, device
+// special files, and (when filesystemOptions.regularFilesUseSpecialFileFD is
+// in effect) regular files. specialFileFD differs from regularFileFD by using
+// per-FD handles instead of shared per-dentry handles, and never buffering I/O.
type specialFileFD struct {
fileDescription
@@ -40,14 +43,49 @@ type specialFileFD struct {
// file offset is significant, i.e. a regular file. seekable is immutable.
seekable bool
+ // haveQueue is true if this file description represents a file for which
+ // queue may send I/O readiness events. haveQueue is immutable.
+ haveQueue bool
+ queue waiter.Queue
+
// If seekable is true, off is the file offset. off is protected by mu.
mu sync.Mutex
off int64
}
+func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, flags uint32) (*specialFileFD, error) {
+ ftype := d.fileType()
+ seekable := ftype == linux.S_IFREG
+ haveQueue := (ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK) && h.fd >= 0
+ fd := &specialFileFD{
+ handle: h,
+ seekable: seekable,
+ haveQueue: haveQueue,
+ }
+ fd.LockFD.Init(locks)
+ if haveQueue {
+ if err := fdnotifier.AddFD(h.fd, &fd.queue); err != nil {
+ return nil, err
+ }
+ }
+ if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
+ DenyPRead: !seekable,
+ DenyPWrite: !seekable,
+ }); err != nil {
+ if haveQueue {
+ fdnotifier.RemoveFD(h.fd)
+ }
+ return nil, err
+ }
+ return fd, nil
+}
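newSpecialFileFD registers pipe and socket host FDs with fdnotifier so that host readiness events reach fd.queue, and unregisters on the failure path. A self-contained sketch of that register/cleanup pattern, assuming only the pkg/fdnotifier and pkg/waiter APIs visible in this hunk:

// notifierSketch registers hostFD with fdnotifier so readiness changes on
// the host FD are delivered to q's waiters. A successful AddFD must be
// paired with fdnotifier.RemoveFD(hostFD) before the FD is closed, as the
// constructor above does on its error path and as Release does below.
func notifierSketch(hostFD int32, q *waiter.Queue) (cleanup func(), err error) {
	if err := fdnotifier.AddFD(hostFD, q); err != nil {
		return nil, err
	}
	return func() { fdnotifier.RemoveFD(hostFD) }, nil
}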
+
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *specialFileFD) Release() {
- fd.handle.close(context.Background())
+func (fd *specialFileFD) Release(ctx context.Context) {
+ if fd.haveQueue {
+ fdnotifier.RemoveFD(fd.handle.fd)
+ }
+ fd.handle.close(ctx)
fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
fs.syncMu.Lock()
delete(fs.specialFileFDs, fd)
@@ -62,12 +100,44 @@ func (fd *specialFileFD) OnClose(ctx context.Context) error {
return fd.handle.file.flush(ctx)
}
+// Readiness implements waiter.Waitable.Readiness.
+func (fd *specialFileFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+ if fd.haveQueue {
+ return fdnotifier.NonBlockingPoll(fd.handle.fd, mask)
+ }
+ return fd.fileDescription.Readiness(mask)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (fd *specialFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ if fd.haveQueue {
+ fd.queue.EventRegister(e, mask)
+ fdnotifier.UpdateFD(fd.handle.fd)
+ return
+ }
+ fd.fileDescription.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (fd *specialFileFD) EventUnregister(e *waiter.Entry) {
+ if fd.haveQueue {
+ fd.queue.EventUnregister(e)
+ fdnotifier.UpdateFD(fd.handle.fd)
+ return
+ }
+ fd.fileDescription.EventUnregister(e)
+}
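With these three methods, a queue-backed specialFileFD behaves as an ordinary waiter.Waitable. A hedged usage sketch (waitForInput is illustrative, not part of this change, and assumes gvisor's pkg/waiter):

// waitForInput blocks until w reports readability.
func waitForInput(w waiter.Waitable) {
	e, ch := waiter.NewChannelEntry(nil)
	w.EventRegister(&e, waiter.EventIn)
	defer w.EventUnregister(&e)
	for w.Readiness(waiter.EventIn)&waiter.EventIn == 0 {
		<-ch // fdnotifier signals the queue when the host FD becomes ready
	}
}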
+
// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
if fd.seekable && offset < 0 {
return 0, syserror.EINVAL
}
- if opts.Flags != 0 {
+
+ // Check that flags are supported.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+ if opts.Flags&^linux.RWF_HIPRI != 0 {
return 0, syserror.EOPNOTSUPP
}
@@ -76,11 +146,14 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
// mmap due to lock ordering; MM locks precede dentry.dataMu. That doesn't
// hold here since specialFileFD doesn't client-cache data. Just buffer the
// read instead.
- if d := fd.dentry(); d.fs.opts.interop != InteropModeShared {
+ if d := fd.dentry(); d.cachedMetadataAuthoritative() {
d.touchAtime(fd.vfsfd.Mount())
}
buf := make([]byte, dst.NumBytes())
n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
+ if err == syserror.EAGAIN {
+ err = syserror.ErrWouldBlock
+ }
if n == 0 {
return 0, err
}
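The hunk ends before PRead's tail; the buffer-then-copy shape it implements is sketched below. This is a reconstruction from the visible lines, not a quote of the remaining code; readAt stands in for fd.handle.readToBlocksAt:

// bufferedReadSketch reads into an intermediate buffer and only then copies
// out to dst, so no MM locks are taken while filesystem locks are held.
func bufferedReadSketch(ctx context.Context, dst usermem.IOSequence, readAt func([]byte) (uint64, error)) (int64, error) {
	buf := make([]byte, dst.NumBytes())
	n, err := readAt(buf)
	if n == 0 {
		return 0, err
	}
	cn, cerr := dst.CopyOut(ctx, buf[:n]) // copy only the bytes actually read
	if cerr != nil {
		return int64(cn), cerr
	}
	return int64(cn), err
}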
@@ -105,32 +178,76 @@ func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts
// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ n, _, err := fd.pwrite(ctx, src, offset, opts)
+ return n, err
+}
+
+// pwrite returns the number of bytes written, the final offset, and an
+// error. The final offset should be ignored by PWrite.
+func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
if fd.seekable && offset < 0 {
- return 0, syserror.EINVAL
+ return 0, offset, syserror.EINVAL
}
- if opts.Flags != 0 {
- return 0, syserror.EOPNOTSUPP
+
+ // Check that flags are supported.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
+ if opts.Flags&^linux.RWF_HIPRI != 0 {
+ return 0, offset, syserror.EOPNOTSUPP
+ }
+
+ d := fd.dentry()
+ // If the regular file fd was opened with O_APPEND, make sure the file size
+ // is updated. There is a possible race here if the size is modified
+ // externally after the metadata cache is updated.
+ if fd.seekable && fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
+ if err := d.updateFromGetattr(ctx); err != nil {
+ return 0, offset, err
+ }
}
if fd.seekable {
+ // We need to hold the metadataMu *while* writing to a regular file.
+ d.metadataMu.Lock()
+ defer d.metadataMu.Unlock()
+
+ // Set offset to file size if the regular file was opened with O_APPEND.
+ if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+ // Holding d.metadataMu is sufficient for reading d.size.
+ offset = int64(d.size)
+ }
limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
if err != nil {
- return 0, err
+ return 0, offset, err
}
src = src.TakeFirst64(limit)
}
// Do a buffered write. See rationale in PRead.
- if d := fd.dentry(); d.fs.opts.interop != InteropModeShared {
+ if d.cachedMetadataAuthoritative() {
d.touchCMtime()
}
buf := make([]byte, src.NumBytes())
// Don't do partial writes if we get a partial read from src.
if _, err := src.CopyIn(ctx, buf); err != nil {
- return 0, err
+ return 0, offset, err
}
n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
- return int64(n), err
+ if err == syserror.EAGAIN {
+ err = syserror.ErrWouldBlock
+ }
+ finalOff = offset
+ // Update file size for regular files.
+ if fd.seekable {
+ finalOff += int64(n)
+ // d.metadataMu is already locked at this point.
+ if uint64(finalOff) > d.size {
+ d.dataMu.Lock()
+ defer d.dataMu.Unlock()
+ atomic.StoreUint64(&d.size, uint64(finalOff))
+ }
+ }
+ return int64(n), finalOff, err
}
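The O_APPEND path above relies on d.metadataMu being held from the moment d.size is read until the new size is published. A comment-only condensation of that critical section, restating the hunk:

// O_APPEND critical section in pwrite:
//
//   d.metadataMu.Lock()
//     offset = int64(d.size)          // snapshot EOF; metadataMu serializes this
//     n := handle.writeFromBlocksAt() // write at the snapshotted offset
//     if offset+n > d.size {
//       d.dataMu.Lock()
//       atomic.StoreUint64(&d.size, uint64(offset+n))
//       d.dataMu.Unlock()
//     }
//   d.metadataMu.Unlock()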
// Write implements vfs.FileDescriptionImpl.Write.
@@ -140,8 +257,8 @@ func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts
}
fd.mu.Lock()
- n, err := fd.PWrite(ctx, src, fd.off, opts)
- fd.off += n
+ n, off, err := fd.pwrite(ctx, src, fd.off, opts)
+ fd.off = off
fd.mu.Unlock()
return n, err
}
@@ -153,27 +270,23 @@ func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (
}
fd.mu.Lock()
defer fd.mu.Unlock()
- switch whence {
- case linux.SEEK_SET:
- // Use offset as given.
- case linux.SEEK_CUR:
- offset += fd.off
- default:
- // SEEK_END, SEEK_DATA, and SEEK_HOLE aren't supported since it's not
- // clear that file size is even meaningful for these files.
- return 0, syserror.EINVAL
- }
- if offset < 0 {
- return 0, syserror.EINVAL
+ newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence)
+ if err != nil {
+ return 0, err
}
- fd.off = offset
- return offset, nil
+ fd.off = newOffset
+ return newOffset, nil
}
// Sync implements vfs.FileDescriptionImpl.Sync.
func (fd *specialFileFD) Sync(ctx context.Context) error {
- if !fd.vfsfd.IsWritable() {
- return nil
+ // If we have a host FD, fsyncing it is likely to be faster than an fsync
+ // RPC.
+ if fd.handle.fd >= 0 {
+ ctx.UninterruptibleSleepStart(false)
+ err := syscall.Fsync(int(fd.handle.fd))
+ ctx.UninterruptibleSleepFinish(false)
+ return err
}
- return fd.handle.sync(ctx)
+ return fd.handle.file.fsync(ctx)
}
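Sync's host-FD fast path brackets the blocking fsync with the context's uninterruptible-sleep markers. A minimal sketch of that bracketing pattern, assuming gvisor's pkg/context and the standard syscall package:

// hostFsync fsyncs a host FD while telling the sentry scheduler that the
// task may sleep uninterruptibly in the host kernel.
func hostFsync(ctx context.Context, fd int) error {
	ctx.UninterruptibleSleepStart(false)
	err := syscall.Fsync(fd)
	ctx.UninterruptibleSleepFinish(false)
	return err
}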
diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go
index 2608e7e1d..2cb8191b9 100644
--- a/pkg/sentry/fsimpl/gofer/time.go
+++ b/pkg/sentry/fsimpl/gofer/time.go
@@ -36,20 +36,24 @@ func statxTimestampFromDentry(ns int64) linux.StatxTimestamp {
}
}
-// Preconditions: fs.interop != InteropModeShared.
+// Preconditions: d.cachedMetadataAuthoritative() == true.
func (d *dentry) touchAtime(mnt *vfs.Mount) {
+ if mnt.Flags.NoATime {
+ return
+ }
if err := mnt.CheckBeginWrite(); err != nil {
return
}
now := d.fs.clock.Now().Nanoseconds()
d.metadataMu.Lock()
atomic.StoreInt64(&d.atime, now)
+ atomic.StoreUint32(&d.atimeDirty, 1)
d.metadataMu.Unlock()
mnt.EndWrite()
}
-// Preconditions: fs.interop != InteropModeShared. The caller has successfully
-// called vfs.Mount.CheckBeginWrite().
+// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
+// successfully called vfs.Mount.CheckBeginWrite().
func (d *dentry) touchCtime() {
now := d.fs.clock.Now().Nanoseconds()
d.metadataMu.Lock()
@@ -57,18 +61,22 @@ func (d *dentry) touchCtime() {
d.metadataMu.Unlock()
}
-// Preconditions: fs.interop != InteropModeShared. The caller has successfully
-// called vfs.Mount.CheckBeginWrite().
+// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
+// successfully called vfs.Mount.CheckBeginWrite().
func (d *dentry) touchCMtime() {
now := d.fs.clock.Now().Nanoseconds()
d.metadataMu.Lock()
atomic.StoreInt64(&d.mtime, now)
atomic.StoreInt64(&d.ctime, now)
+ atomic.StoreUint32(&d.mtimeDirty, 1)
d.metadataMu.Unlock()
}
+// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
+// locked d.metadataMu.
func (d *dentry) touchCMtimeLocked() {
now := d.fs.clock.Now().Nanoseconds()
atomic.StoreInt64(&d.mtime, now)
atomic.StoreInt64(&d.ctime, now)
+ atomic.StoreUint32(&d.mtimeDirty, 1)
}
diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD
index ca0fe6d2b..bd701bbc7 100644
--- a/pkg/sentry/fsimpl/host/BUILD
+++ b/pkg/sentry/fsimpl/host/BUILD
@@ -22,17 +22,18 @@ go_library(
"//pkg/context",
"//pkg/fdnotifier",
"//pkg/fspath",
+ "//pkg/iovec",
"//pkg/log",
"//pkg/refs",
"//pkg/safemem",
"//pkg/sentry/arch",
"//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/fsimpl/kernfs",
"//pkg/sentry/hostfd",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/memmap",
- "//pkg/sentry/platform",
"//pkg/sentry/socket/control",
"//pkg/sentry/socket/unix",
"//pkg/sentry/socket/unix/transport",
diff --git a/pkg/sentry/fsimpl/host/control.go b/pkg/sentry/fsimpl/host/control.go
index b9082a20f..0135e4428 100644
--- a/pkg/sentry/fsimpl/host/control.go
+++ b/pkg/sentry/fsimpl/host/control.go
@@ -58,7 +58,7 @@ func (c *scmRights) Clone() transport.RightsControlMessage {
}
// Release implements transport.RightsControlMessage.Release.
-func (c *scmRights) Release() {
+func (c *scmRights) Release(ctx context.Context) {
for _, fd := range c.fds {
syscall.Close(fd)
}
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 18b127521..bd6caba06 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -28,6 +28,7 @@ import (
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/refs"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/hostfd"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -90,7 +91,9 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions)
isTTY: opts.IsTTY,
wouldBlock: wouldBlock(uint32(fileType)),
seekable: seekable,
- canMap: canMap(uint32(fileType)),
+ // NOTE(b/38213152): Technically, some obscure char devices can be memory
+ // mapped, but we only allow regular files.
+ canMap: fileType == linux.S_IFREG,
}
i.pf.inode = i
@@ -114,7 +117,7 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions)
d.Init(i)
// i.open will take a reference on d.
- defer d.DecRef()
+ defer d.DecRef(ctx)
// For simplicity, fileDescription.offset is set to 0. Technically, we
// should only set to 0 on files that are not seekable (sockets, pipes,
@@ -165,9 +168,9 @@ type filesystem struct {
devMinor uint32
}
-func (fs *filesystem) Release() {
+func (fs *filesystem) Release(ctx context.Context) {
fs.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
- fs.Filesystem.Release()
+ fs.Filesystem.Release(ctx)
}
func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
@@ -182,6 +185,8 @@ type inode struct {
kernfs.InodeNotDirectory
kernfs.InodeNotSymlink
+ locks vfs.FileLocks
+
// When the reference count reaches zero, the host fd is closed.
refs.AtomicRefCount
@@ -254,7 +259,7 @@ func (i *inode) Mode() linux.FileMode {
}
// Stat implements kernfs.Inode.
-func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
if opts.Mask&linux.STATX__RESERVED != 0 {
return linux.Statx{}, syserror.EINVAL
}
@@ -368,7 +373,7 @@ func (i *inode) fstat(fs *filesystem) (linux.Statx, error) {
// SetStat implements kernfs.Inode.
func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
- s := opts.Stat
+ s := &opts.Stat
m := s.Mask
if m == 0 {
@@ -381,7 +386,7 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
if err := syscall.Fstat(i.hostFD, &hostStat); err != nil {
return err
}
- if err := vfs.CheckSetStat(ctx, creds, &s, linux.FileMode(hostStat.Mode&linux.PermissionsMask), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil {
+ if err := vfs.CheckSetStat(ctx, creds, &opts, linux.FileMode(hostStat.Mode), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil {
return err
}
@@ -391,6 +396,9 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
}
}
if m&linux.STATX_SIZE != 0 {
+ if hostStat.Mode&linux.S_IFMT != linux.S_IFREG {
+ return syserror.EINVAL
+ }
if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil {
return err
}
@@ -423,12 +431,12 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
}
// DecRef implements kernfs.Inode.
-func (i *inode) DecRef() {
- i.AtomicRefCount.DecRefWithDestructor(i.Destroy)
+func (i *inode) DecRef(ctx context.Context) {
+ i.AtomicRefCount.DecRefWithDestructor(ctx, i.Destroy)
}
// Destroy implements kernfs.Inode.
-func (i *inode) Destroy() {
+func (i *inode) Destroy(context.Context) {
if i.wouldBlock {
fdnotifier.RemoveFD(int32(i.hostFD))
}
@@ -454,10 +462,12 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u
fileType := s.Mode & linux.FileTypeMask
// Constrain flags to a subset we can handle.
- // TODO(gvisor.dev/issue/1672): implement behavior corresponding to these allowed flags.
- flags &= syscall.O_ACCMODE | syscall.O_DIRECT | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND
+ //
+ // TODO(gvisor.dev/issue/2601): Support O_NONBLOCK by adding RWF_NOWAIT to pread/pwrite calls.
+ flags &= syscall.O_ACCMODE | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND
- if fileType == syscall.S_IFSOCK {
+ switch fileType {
+ case syscall.S_IFSOCK:
if i.isTTY {
log.Warningf("cannot use host socket fd %d as TTY", i.hostFD)
return nil, syserror.ENOTTY
@@ -468,35 +478,41 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u
return nil, err
}
// Currently, we only allow Unix sockets to be imported.
- return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d)
- }
+ return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d, &i.locks)
- // TODO(gvisor.dev/issue/1672): Whitelist specific file types here, so that
- // we don't allow importing arbitrary file types without proper support.
- if i.isTTY {
- fd := &TTYFileDescription{
- fileDescription: fileDescription{inode: i},
- termios: linux.DefaultSlaveTermios,
+ case syscall.S_IFREG, syscall.S_IFIFO, syscall.S_IFCHR:
+ if i.isTTY {
+ fd := &TTYFileDescription{
+ fileDescription: fileDescription{inode: i},
+ termios: linux.DefaultSlaveTermios,
+ }
+ fd.LockFD.Init(&i.locks)
+ vfsfd := &fd.vfsfd
+ if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
+ return nil, err
+ }
+ return vfsfd, nil
}
+
+ fd := &fileDescription{inode: i}
+ fd.LockFD.Init(&i.locks)
vfsfd := &fd.vfsfd
if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
return nil, err
}
return vfsfd, nil
- }
- fd := &fileDescription{inode: i}
- vfsfd := &fd.vfsfd
- if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
- return nil, err
+ default:
+ log.Warningf("cannot import host fd %d with file type %o", i.hostFD, fileType)
+ return nil, syserror.EPERM
}
- return vfsfd, nil
}
// fileDescription is embedded by host fd implementations of FileDescriptionImpl.
type fileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
+ vfs.LockFD
// inode is vfsfd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode), but
// cached to reduce indirections and casting. fileDescription does not hold
@@ -521,15 +537,25 @@ func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
}
// Stat implements vfs.FileDescriptionImpl.
-func (f *fileDescription) Stat(_ context.Context, opts vfs.StatOptions) (linux.Statx, error) {
- return f.inode.Stat(f.vfsfd.Mount().Filesystem(), opts)
+func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+ return f.inode.Stat(ctx, f.vfsfd.Mount().Filesystem(), opts)
}
// Release implements vfs.FileDescriptionImpl.
-func (f *fileDescription) Release() {
+func (f *fileDescription) Release(context.Context) {
// noop
}
+// Allocate implements vfs.FileDescriptionImpl.
+func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ if !f.inode.seekable {
+ return syserror.ESPIPE
+ }
+
+ // TODO(gvisor.dev/issue/3589): Implement Allocate for non-pipe hostfds.
+ return syserror.EOPNOTSUPP
+}
+
// PRead implements FileDescriptionImpl.
func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
i := f.inode
@@ -556,7 +582,7 @@ func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts
}
return n, err
}
- // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so.
+
f.offsetMu.Lock()
n, err := readFromHostFD(ctx, i.hostFD, dst, f.offset, opts.Flags)
f.offset += n
@@ -565,8 +591,10 @@ func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts
}
func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) {
- // TODO(gvisor.dev/issue/1672): Support select preadv2 flags.
- if flags != 0 {
+ // Check that flags are supported.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+ if flags&^linux.RWF_HIPRI != 0 {
return 0, syserror.EOPNOTSUPP
}
reader := hostfd.GetReadWriterAt(int32(hostFD), offset, flags)
@@ -577,41 +605,58 @@ func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, off
// PWrite implements FileDescriptionImpl.
func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
- i := f.inode
- if !i.seekable {
+ if !f.inode.seekable {
return 0, syserror.ESPIPE
}
- return writeToHostFD(ctx, i.hostFD, src, offset, opts.Flags)
+ return f.writeToHostFD(ctx, src, offset, opts.Flags)
}
// Write implements FileDescriptionImpl.
func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
i := f.inode
if !i.seekable {
- n, err := writeToHostFD(ctx, i.hostFD, src, -1, opts.Flags)
+ n, err := f.writeToHostFD(ctx, src, -1, opts.Flags)
if isBlockError(err) {
err = syserror.ErrWouldBlock
}
return n, err
}
- // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so.
- // TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file.
+
f.offsetMu.Lock()
- n, err := writeToHostFD(ctx, i.hostFD, src, f.offset, opts.Flags)
+ // NOTE(gvisor.dev/issue/2983): O_APPEND may cause data corruption if
+ // another process modifies the host file between retrieving the file size
+ // and writing to the host fd. This is an unavoidable race condition because
+ // we cannot enforce synchronization on the host.
+ if f.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+ var s syscall.Stat_t
+ if err := syscall.Fstat(i.hostFD, &s); err != nil {
+ f.offsetMu.Unlock()
+ return 0, err
+ }
+ f.offset = s.Size
+ }
+ n, err := f.writeToHostFD(ctx, src, f.offset, opts.Flags)
f.offset += n
f.offsetMu.Unlock()
return n, err
}
-func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offset int64, flags uint32) (int64, error) {
- // TODO(gvisor.dev/issue/1672): Support select pwritev2 flags.
+func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSequence, offset int64, flags uint32) (int64, error) {
+ hostFD := f.inode.hostFD
+ // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
if flags != 0 {
return 0, syserror.EOPNOTSUPP
}
writer := hostfd.GetReadWriterAt(int32(hostFD), offset, flags)
n, err := src.CopyInTo(ctx, writer)
hostfd.PutReadWriterAt(writer)
+ // NOTE(gvisor.dev/issue/2979): We always sync everything, even for O_DSYNC.
+ if n > 0 && f.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 {
+ if syncErr := unix.Fsync(hostFD); syncErr != nil {
+ return int64(n), syncErr
+ }
+ }
return int64(n), err
}
@@ -682,7 +727,7 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i
// Sync implements FileDescriptionImpl.
func (f *fileDescription) Sync(context.Context) error {
- // TODO(gvisor.dev/issue/1672): Currently we do not support the SyncData optimization, so we always sync everything.
+ // TODO(gvisor.dev/issue/1897): Currently, we always sync everything.
return unix.Fsync(f.inode.hostFD)
}
@@ -712,3 +757,13 @@ func (f *fileDescription) EventUnregister(e *waiter.Entry) {
func (f *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
return fdnotifier.NonBlockingPoll(int32(f.inode.hostFD), mask)
}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (f *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return f.Locks().LockPOSIX(ctx, &f.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (f *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return f.Locks().UnlockPOSIX(ctx, &f.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/host/mmap.go b/pkg/sentry/fsimpl/host/mmap.go
index 8545a82f0..65d3af38c 100644
--- a/pkg/sentry/fsimpl/host/mmap.go
+++ b/pkg/sentry/fsimpl/host/mmap.go
@@ -19,13 +19,12 @@ import (
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/usermem"
)
-// inodePlatformFile implements platform.File. It exists solely because inode
-// cannot implement both kernfs.Inode.IncRef and platform.File.IncRef.
+// inodePlatformFile implements memmap.File. It exists solely because inode
+// cannot implement both kernfs.Inode.IncRef and memmap.File.IncRef.
//
// inodePlatformFile should only be used if inode.canMap is true.
type inodePlatformFile struct {
@@ -34,7 +33,7 @@ type inodePlatformFile struct {
// fdRefsMu protects fdRefs.
fdRefsMu sync.Mutex
- // fdRefs counts references on platform.File offsets. It is used solely for
+ // fdRefs counts references on memmap.File offsets. It is used solely for
// memory accounting.
fdRefs fsutil.FrameRefSet
@@ -45,32 +44,32 @@ type inodePlatformFile struct {
fileMapperInitOnce sync.Once
}
-// IncRef implements platform.File.IncRef.
+// IncRef implements memmap.File.IncRef.
//
// Precondition: i.inode.canMap must be true.
-func (i *inodePlatformFile) IncRef(fr platform.FileRange) {
+func (i *inodePlatformFile) IncRef(fr memmap.FileRange) {
i.fdRefsMu.Lock()
i.fdRefs.IncRefAndAccount(fr)
i.fdRefsMu.Unlock()
}
-// DecRef implements platform.File.DecRef.
+// DecRef implements memmap.File.DecRef.
//
// Precondition: i.inode.canMap must be true.
-func (i *inodePlatformFile) DecRef(fr platform.FileRange) {
+func (i *inodePlatformFile) DecRef(fr memmap.FileRange) {
i.fdRefsMu.Lock()
i.fdRefs.DecRefAndAccount(fr)
i.fdRefsMu.Unlock()
}
-// MapInternal implements platform.File.MapInternal.
+// MapInternal implements memmap.File.MapInternal.
//
// Precondition: i.inode.canMap must be true.
-func (i *inodePlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+func (i *inodePlatformFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
return i.fileMapper.MapInternal(fr, i.hostFD, at.Write)
}
-// FD implements platform.File.FD.
+// FD implements memmap.File.FD.
func (i *inodePlatformFile) FD() int {
return i.hostFD
}
diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go
index 38f1fbfba..4979dd0a9 100644
--- a/pkg/sentry/fsimpl/host/socket.go
+++ b/pkg/sentry/fsimpl/host/socket.go
@@ -47,11 +47,6 @@ func newEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue) (transpor
return ep, nil
}
-// maxSendBufferSize is the maximum host send buffer size allowed for endpoint.
-//
-// N.B. 8MB is the default maximum on Linux (2 * sysctl_wmem_max).
-const maxSendBufferSize = 8 << 20
-
// ConnectedEndpoint is an implementation of transport.ConnectedEndpoint and
// transport.Receiver. It is backed by a host fd that was imported at sentry
// startup. This fd is shared with a hostfs inode, which retains ownership of
@@ -114,10 +109,6 @@ func (c *ConnectedEndpoint) init() *syserr.Error {
if err != nil {
return syserr.FromError(err)
}
- if sndbuf > maxSendBufferSize {
- log.Warningf("Socket send buffer too large: %d", sndbuf)
- return syserr.ErrInvalidEndpointState
- }
c.stype = linux.SockType(stype)
c.sndbuf = int64(sndbuf)
@@ -148,7 +139,7 @@ func NewConnectedEndpoint(ctx context.Context, hostFD int, addr string, saveable
}
// Send implements transport.ConnectedEndpoint.Send.
-func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) {
+func (c *ConnectedEndpoint) Send(ctx context.Context, data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) {
c.mu.RLock()
defer c.mu.RUnlock()
@@ -225,7 +216,7 @@ func (c *ConnectedEndpoint) EventUpdate() {
}
// Recv implements transport.Receiver.Recv.
-func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
+func (c *ConnectedEndpoint) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
c.mu.RLock()
defer c.mu.RUnlock()
@@ -326,8 +317,8 @@ func (c *ConnectedEndpoint) destroyLocked() {
// Release implements transport.ConnectedEndpoint.Release and
// transport.Receiver.Release.
-func (c *ConnectedEndpoint) Release() {
- c.ref.DecRefWithDestructor(func() {
+func (c *ConnectedEndpoint) Release(ctx context.Context) {
+ c.ref.DecRefWithDestructor(ctx, func(context.Context) {
c.mu.Lock()
c.destroyLocked()
c.mu.Unlock()
@@ -356,8 +347,8 @@ func (e *SCMConnectedEndpoint) Init() error {
// Release implements transport.ConnectedEndpoint.Release and
// transport.Receiver.Release.
-func (e *SCMConnectedEndpoint) Release() {
- e.ref.DecRefWithDestructor(func() {
+func (e *SCMConnectedEndpoint) Release(ctx context.Context) {
+ e.ref.DecRefWithDestructor(ctx, func(context.Context) {
e.mu.Lock()
if err := syscall.Close(e.fd); err != nil {
log.Warningf("Failed to close host fd %d: %v", err)
diff --git a/pkg/sentry/fsimpl/host/socket_iovec.go b/pkg/sentry/fsimpl/host/socket_iovec.go
index 584c247d2..fc0d5fd38 100644
--- a/pkg/sentry/fsimpl/host/socket_iovec.go
+++ b/pkg/sentry/fsimpl/host/socket_iovec.go
@@ -17,13 +17,10 @@ package host
import (
"syscall"
- "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/iovec"
"gvisor.dev/gvisor/pkg/syserror"
)
-// maxIovs is the maximum number of iovecs to pass to the host.
-var maxIovs = linux.UIO_MAXIOV
-
// copyToMulti copies as many bytes from src to dst as possible.
func copyToMulti(dst [][]byte, src []byte) {
for _, d := range dst {
@@ -74,7 +71,7 @@ func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovec
}
}
- if iovsRequired > maxIovs {
+ if iovsRequired > iovec.MaxIovs {
// The kernel will reject our call if we pass this many iovs.
// Use a single intermediate buffer instead.
b := make([]byte, stopLen)
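iovec.MaxIovs replaces the package-local maxIovs and matches Linux's UIO_MAXIOV (1024). When a message would need more iovecs than that, buildIovec coalesces into one intermediate buffer, as sketched here (iovecFallbackSketch is illustrative; the real caller scatters or gathers the buffer via copyToMulti/copyFromMulti around the host call):

// iovecFallbackSketch shows the decision buildIovec makes: beyond
// iovec.MaxIovs entries the host would reject the syscall, so a single
// intermediate buffer of the total length is used instead.
func iovecFallbackSketch(iovsRequired int, stopLen int64) (useIntermediate bool, buf []byte) {
	if iovsRequired <= iovec.MaxIovs {
		return false, nil // one iovec per buffer is fine
	}
	return true, make([]byte, stopLen)
}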
diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go
index 68af6e5af..d372c60cb 100644
--- a/pkg/sentry/fsimpl/host/tty.go
+++ b/pkg/sentry/fsimpl/host/tty.go
@@ -18,6 +18,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/unimpl"
"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -66,12 +67,12 @@ func (t *TTYFileDescription) ForegroundProcessGroup() *kernel.ProcessGroup {
}
// Release implements fs.FileOperations.Release.
-func (t *TTYFileDescription) Release() {
+func (t *TTYFileDescription) Release(ctx context.Context) {
t.mu.Lock()
t.fgProcessGroup = nil
t.mu.Unlock()
- t.fileDescription.Release()
+ t.fileDescription.Release(ctx)
}
// PRead implements vfs.FileDescriptionImpl.
@@ -325,9 +326,9 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal)
task := kernel.TaskFromContext(ctx)
if task == nil {
// No task? Linux does not have an analog for this case, but
- // tty_check_change is more of a blacklist of cases than a
- // whitelist, and is surprisingly permissive. Allowing the
- // change seems most appropriate.
+ // tty_check_change only blocks specific cases and is
+ // surprisingly permissive. Allowing the change seems
+ // appropriate.
return nil
}
@@ -377,3 +378,13 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal)
_ = pg.SendSignal(kernel.SignalInfoPriv(sig))
return kernel.ERESTARTSYS
}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (t *TTYFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, typ fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return t.Locks().LockPOSIX(ctx, &t.vfsfd, uid, typ, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (t *TTYFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return t.Locks().UnlockPOSIX(ctx, &t.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go
index 2bc757b1a..412bdb2eb 100644
--- a/pkg/sentry/fsimpl/host/util.go
+++ b/pkg/sentry/fsimpl/host/util.go
@@ -49,16 +49,6 @@ func wouldBlock(fileType uint32) bool {
return fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK
}
-// canMap returns true if a file with fileType is allowed to be memory mapped.
-// This is ported over from VFS1, but it's probably not the best way for us
-// to check if a file can be memory mapped.
-func canMap(fileType uint32) bool {
- // TODO(gvisor.dev/issue/1672): Also allow "special files" to be mapped (see fs/host:canMap()).
- //
- // TODO(b/38213152): Some obscure character devices can be mapped.
- return fileType == syscall.S_IFREG
-}
-
// isBlockError checks if an error is EAGAIN or EWOULDBLOCK.
// If so, they can be transformed into syserror.ErrWouldBlock.
func isBlockError(err error) bool {
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index ef34cb28a..3835557fe 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -45,6 +45,7 @@ go_library(
"//pkg/fspath",
"//pkg/log",
"//pkg/refs",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/memmap",
"//pkg/sentry/socket/unix/transport",
@@ -69,6 +70,6 @@ go_test(
"//pkg/sentry/vfs",
"//pkg/syserror",
"//pkg/usermem",
- "@com_github_google_go-cmp//cmp:go_default_library",
+ "@com_github_google_go_cmp//cmp:go_default_library",
],
)
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index 1568a9d49..12adf727a 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -19,6 +19,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
@@ -38,7 +39,8 @@ type DynamicBytesFile struct {
InodeNotDirectory
InodeNotSymlink
- data vfs.DynamicBytesSource
+ locks vfs.FileLocks
+ data vfs.DynamicBytesSource
}
var _ Inode = (*DynamicBytesFile)(nil)
@@ -55,7 +57,7 @@ func (f *DynamicBytesFile) Init(creds *auth.Credentials, devMajor, devMinor uint
// Open implements Inode.Open.
func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd := &DynamicBytesFD{}
- if err := fd.Init(rp.Mount(), vfsd, f.data, opts.Flags); err != nil {
+ if err := fd.Init(rp.Mount(), vfsd, f.data, &f.locks, opts.Flags); err != nil {
return nil, err
}
return &fd.vfsfd, nil
@@ -77,13 +79,15 @@ func (*DynamicBytesFile) SetStat(context.Context, *vfs.Filesystem, *auth.Credent
type DynamicBytesFD struct {
vfs.FileDescriptionDefaultImpl
vfs.DynamicBytesFileDescriptionImpl
+ vfs.LockFD
vfsfd vfs.FileDescription
inode Inode
}
// Init initializes a DynamicBytesFD.
-func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) error {
+func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error {
+ fd.LockFD.Init(locks)
if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
return err
}
@@ -97,12 +101,12 @@ func (fd *DynamicBytesFD) Seek(ctx context.Context, offset int64, whence int32)
return fd.DynamicBytesFileDescriptionImpl.Seek(ctx, offset, whence)
}
-// Read implmenets vfs.FileDescriptionImpl.Read.
+// Read implements vfs.FileDescriptionImpl.Read.
func (fd *DynamicBytesFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
return fd.DynamicBytesFileDescriptionImpl.Read(ctx, dst, opts)
}
-// PRead implmenets vfs.FileDescriptionImpl.PRead.
+// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *DynamicBytesFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
return fd.DynamicBytesFileDescriptionImpl.PRead(ctx, dst, offset, opts)
}
@@ -118,12 +122,12 @@ func (fd *DynamicBytesFD) PWrite(ctx context.Context, src usermem.IOSequence, of
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *DynamicBytesFD) Release() {}
+func (fd *DynamicBytesFD) Release(context.Context) {}
// Stat implements vfs.FileDescriptionImpl.Stat.
func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
fs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
- return fd.inode.Stat(fs, opts)
+ return fd.inode.Stat(ctx, fs, opts)
}
// SetStat implements vfs.FileDescriptionImpl.SetStat.
@@ -131,3 +135,13 @@ func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error {
// DynamicBytesFiles are immutable.
return syserror.EPERM
}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *DynamicBytesFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *DynamicBytesFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index 8284e76a7..fcee6200a 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -19,6 +19,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -42,6 +43,7 @@ import (
type GenericDirectoryFD struct {
vfs.FileDescriptionDefaultImpl
vfs.DirectoryFileDescriptionDefaultImpl
+ vfs.LockFD
vfsfd vfs.FileDescription
children *OrderedChildren
@@ -55,9 +57,9 @@ type GenericDirectoryFD struct {
// NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its
// dentry.
-func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) {
+func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) {
fd := &GenericDirectoryFD{}
- if err := fd.Init(children, opts); err != nil {
+ if err := fd.Init(children, locks, opts); err != nil {
return nil, err
}
if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
@@ -69,11 +71,12 @@ func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildre
// Init initializes a GenericDirectoryFD. Use it when overriding
// GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the
// correct implementation.
-func (fd *GenericDirectoryFD) Init(children *OrderedChildren, opts *vfs.OpenOptions) error {
+func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) error {
if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 {
// Can't open directories for writing.
return syserror.EISDIR
}
+ fd.LockFD.Init(locks)
fd.children = children
return nil
}
@@ -109,8 +112,8 @@ func (fd *GenericDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence
return fd.DirectoryFileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts)
}
-// Release implements vfs.FileDecriptionImpl.Release.
-func (fd *GenericDirectoryFD) Release() {}
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *GenericDirectoryFD) Release(context.Context) {}
func (fd *GenericDirectoryFD) filesystem() *vfs.Filesystem {
return fd.vfsfd.VirtualDentry().Mount().Filesystem()
@@ -120,7 +123,7 @@ func (fd *GenericDirectoryFD) inode() Inode {
return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
}
-// IterDirents implements vfs.FileDecriptionImpl.IterDirents. IterDirents holds
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents. IterDirents holds
// o.mu when calling cb.
func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
fd.mu.Lock()
@@ -129,7 +132,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
opts := vfs.StatOptions{Mask: linux.STATX_INO}
// Handle ".".
if fd.off == 0 {
- stat, err := fd.inode().Stat(fd.filesystem(), opts)
+ stat, err := fd.inode().Stat(ctx, fd.filesystem(), opts)
if err != nil {
return err
}
@@ -149,7 +152,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
if fd.off == 1 {
vfsd := fd.vfsfd.VirtualDentry().Dentry()
parentInode := genericParentOrSelf(vfsd.Impl().(*Dentry)).inode
- stat, err := parentInode.Stat(fd.filesystem(), opts)
+ stat, err := parentInode.Stat(ctx, fd.filesystem(), opts)
if err != nil {
return err
}
@@ -173,7 +176,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
childIdx := fd.off - 2
for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() {
inode := it.Dentry.Impl().(*Dentry).inode
- stat, err := inode.Stat(fd.filesystem(), opts)
+ stat, err := inode.Stat(ctx, fd.filesystem(), opts)
if err != nil {
return err
}
@@ -195,7 +198,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
return err
}
-// Seek implements vfs.FileDecriptionImpl.Seek.
+// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
fd.mu.Lock()
defer fd.mu.Unlock()
@@ -223,7 +226,7 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int
func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
fs := fd.filesystem()
inode := fd.inode()
- return inode.Stat(fs, opts)
+ return inode.Stat(ctx, fs, opts)
}
// SetStat implements vfs.FileDescriptionImpl.SetStat.
@@ -232,3 +235,18 @@ func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptio
inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
return inode.SetStat(ctx, fd.filesystem(), creds, opts)
}
+
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *GenericDirectoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ return fd.DirectoryFileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *GenericDirectoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *GenericDirectoryFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 4a12ae245..d7edb6342 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -35,7 +35,7 @@ import (
// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
//
// Postcondition: Caller must call fs.processDeferredDecRefs*.
-func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) (*vfs.Dentry, error) {
+func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, mayFollowSymlinks bool) (*vfs.Dentry, error) {
d := vfsd.Impl().(*Dentry)
if !d.isDir() {
return nil, syserror.ENOTDIR
@@ -56,13 +56,13 @@ afterSymlink:
return vfsd, nil
}
if name == ".." {
- if isRoot, err := rp.CheckRoot(vfsd); err != nil {
+ if isRoot, err := rp.CheckRoot(ctx, vfsd); err != nil {
return nil, err
} else if isRoot || d.parent == nil {
rp.Advance()
return vfsd, nil
}
- if err := rp.CheckMount(&d.parent.vfsd); err != nil {
+ if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
return nil, err
}
rp.Advance()
@@ -77,18 +77,18 @@ afterSymlink:
if err != nil {
return nil, err
}
- if err := rp.CheckMount(&next.vfsd); err != nil {
+ if err := rp.CheckMount(ctx, &next.vfsd); err != nil {
return nil, err
}
// Resolve any symlink at current path component.
- if rp.ShouldFollowSymlink() && next.isSymlink() {
+ if mayFollowSymlinks && rp.ShouldFollowSymlink() && next.isSymlink() {
targetVD, targetPathname, err := next.inode.Getlink(ctx, rp.Mount())
if err != nil {
return nil, err
}
if targetVD.Ok() {
err := rp.HandleJump(targetVD)
- targetVD.DecRef()
+ targetVD.DecRef(ctx)
if err != nil {
return nil, err
}
@@ -116,7 +116,7 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
// Cached dentry exists, revalidate.
if !child.inode.Valid(ctx) {
delete(parent.children, name)
- vfsObj.InvalidateDentry(&child.vfsd)
+ vfsObj.InvalidateDentry(ctx, &child.vfsd)
fs.deferDecRef(&child.vfsd) // Reference from Lookup.
child = nil
}
@@ -152,7 +152,7 @@ func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingP
vfsd := rp.Start()
for !rp.Done() {
var err error
- vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd)
+ vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */)
if err != nil {
return nil, nil, err
}
@@ -178,7 +178,7 @@ func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving
vfsd := rp.Start()
for !rp.Final() {
var err error
- vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd)
+ vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */)
if err != nil {
return nil, nil, err
}
@@ -234,7 +234,7 @@ func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Den
}
// Release implements vfs.FilesystemImpl.Release.
-func (fs *Filesystem) Release() {
+func (fs *Filesystem) Release(context.Context) {
}
// Sync implements vfs.FilesystemImpl.Sync.
@@ -246,7 +246,7 @@ func (fs *Filesystem) Sync(ctx context.Context) error {
// AccessAt implements vfs.Filesystem.Impl.AccessAt.
func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
fs.mu.RLock()
- defer fs.processDeferredDecRefs()
+ defer fs.processDeferredDecRefs(ctx)
defer fs.mu.RUnlock()
_, inode, err := fs.walkExistingLocked(ctx, rp)
@@ -259,7 +259,7 @@ func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds
// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
fs.mu.RLock()
- defer fs.processDeferredDecRefs()
+ defer fs.processDeferredDecRefs(ctx)
defer fs.mu.RUnlock()
vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
if err != nil {
@@ -282,7 +282,7 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
fs.mu.RLock()
- defer fs.processDeferredDecRefs()
+ defer fs.processDeferredDecRefs(ctx)
defer fs.mu.RUnlock()
vfsd, _, err := fs.walkParentDirLocked(ctx, rp)
if err != nil {
@@ -300,7 +300,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
fs.mu.Lock()
defer fs.mu.Unlock()
parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
- fs.processDeferredDecRefsLocked()
+ fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return err
}
@@ -337,7 +337,7 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
fs.mu.Lock()
defer fs.mu.Unlock()
parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
- fs.processDeferredDecRefsLocked()
+ fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return err
}
@@ -365,7 +365,7 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
fs.mu.Lock()
defer fs.mu.Unlock()
parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
- fs.processDeferredDecRefsLocked()
+ fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return err
}
@@ -397,7 +397,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
// Do not create new file.
if opts.Flags&linux.O_CREAT == 0 {
fs.mu.RLock()
- defer fs.processDeferredDecRefs()
+ defer fs.processDeferredDecRefs(ctx)
defer fs.mu.RUnlock()
vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
if err != nil {
@@ -429,7 +429,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
}
afterTrailingSymlink:
parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
- fs.processDeferredDecRefsLocked()
+ fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return nil, err
}
@@ -449,7 +449,7 @@ afterTrailingSymlink:
return nil, syserror.ENAMETOOLONG
}
// Determine whether or not we need to create a file.
- childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD)
+ childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD, false /* mayFollowSymlinks */)
if err == syserror.ENOENT {
// Already checked for searchability above; now check for writability.
if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
@@ -483,7 +483,7 @@ afterTrailingSymlink:
}
if targetVD.Ok() {
err := rp.HandleJump(targetVD)
- targetVD.DecRef()
+ targetVD.DecRef(ctx)
if err != nil {
return nil, err
}
@@ -507,7 +507,7 @@ func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st
fs.mu.RLock()
d, inode, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
- fs.processDeferredDecRefs()
+ fs.processDeferredDecRefs(ctx)
if err != nil {
return "", err
}
@@ -526,7 +526,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0
fs.mu.Lock()
- defer fs.processDeferredDecRefsLocked()
+ defer fs.processDeferredDecRefsLocked(ctx)
defer fs.mu.Unlock()
// Resolve the destination directory first to verify that it's on this
@@ -584,7 +584,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
}
mntns := vfs.MountNamespaceFromContext(ctx)
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
virtfs := rp.VirtualFilesystem()
// We can't deadlock here due to lock ordering because we're protected from
@@ -615,7 +615,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
dstDir.children = make(map[string]*Dentry)
}
dstDir.children[pc] = src
- virtfs.CommitRenameReplaceDentry(srcVFSD, replaced)
+ virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaced)
return nil
}
@@ -624,7 +624,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
fs.mu.Lock()
defer fs.mu.Unlock()
vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
- fs.processDeferredDecRefsLocked()
+ fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return err
}
@@ -648,7 +648,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
defer parentDentry.dirMu.Unlock()
mntns := vfs.MountNamespaceFromContext(ctx)
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
return err
}
@@ -656,7 +656,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
virtfs.AbortDeleteDentry(vfsd)
return err
}
- virtfs.CommitDeleteDentry(vfsd)
+ virtfs.CommitDeleteDentry(ctx, vfsd)
return nil
}
@@ -665,7 +665,7 @@ func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
fs.mu.RLock()
_, inode, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
- fs.processDeferredDecRefs()
+ fs.processDeferredDecRefs(ctx)
if err != nil {
return err
}
@@ -680,11 +680,11 @@ func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
fs.mu.RLock()
_, inode, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
- fs.processDeferredDecRefs()
+ fs.processDeferredDecRefs(ctx)
if err != nil {
return linux.Statx{}, err
}
- return inode.Stat(fs.VFSFilesystem(), opts)
+ return inode.Stat(ctx, fs.VFSFilesystem(), opts)
}
// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
@@ -692,7 +692,7 @@ func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
fs.mu.RLock()
_, _, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
- fs.processDeferredDecRefs()
+ fs.processDeferredDecRefs(ctx)
if err != nil {
return linux.Statfs{}, err
}
@@ -708,7 +708,7 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
fs.mu.Lock()
defer fs.mu.Unlock()
parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
- fs.processDeferredDecRefsLocked()
+ fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return err
}
@@ -733,7 +733,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
fs.mu.Lock()
defer fs.mu.Unlock()
vfsd, _, err := fs.walkExistingLocked(ctx, rp)
- fs.processDeferredDecRefsLocked()
+ fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return err
}
@@ -753,7 +753,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
parentDentry.dirMu.Lock()
defer parentDentry.dirMu.Unlock()
mntns := vfs.MountNamespaceFromContext(ctx)
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
return err
}
@@ -761,7 +761,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
virtfs.AbortDeleteDentry(vfsd)
return err
}
- virtfs.CommitDeleteDentry(vfsd)
+ virtfs.CommitDeleteDentry(ctx, vfsd)
return nil
}
@@ -770,7 +770,7 @@ func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
fs.mu.RLock()
_, inode, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
- fs.processDeferredDecRefs()
+ fs.processDeferredDecRefs(ctx)
if err != nil {
return nil, err
}
@@ -785,7 +785,7 @@ func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
fs.mu.RLock()
_, _, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
- fs.processDeferredDecRefs()
+ fs.processDeferredDecRefs(ctx)
if err != nil {
return nil, err
}
@@ -798,7 +798,7 @@ func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
fs.mu.RLock()
_, _, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
- fs.processDeferredDecRefs()
+ fs.processDeferredDecRefs(ctx)
if err != nil {
return "", err
}
@@ -811,7 +811,7 @@ func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
fs.mu.RLock()
_, _, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
- fs.processDeferredDecRefs()
+ fs.processDeferredDecRefs(ctx)
if err != nil {
return err
}
@@ -824,7 +824,7 @@ func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath,
fs.mu.RLock()
_, _, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
- fs.processDeferredDecRefs()
+ fs.processDeferredDecRefs(ctx)
if err != nil {
return err
}
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 982daa2e6..c3efcf3ec 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -40,7 +40,7 @@ func (InodeNoopRefCount) IncRef() {
}
// DecRef implements Inode.DecRef.
-func (InodeNoopRefCount) DecRef() {
+func (InodeNoopRefCount) DecRef(context.Context) {
}
// TryIncRef implements Inode.TryIncRef.
@@ -49,7 +49,7 @@ func (InodeNoopRefCount) TryIncRef() bool {
}
// Destroy implements Inode.Destroy.
-func (InodeNoopRefCount) Destroy() {
+func (InodeNoopRefCount) Destroy(context.Context) {
}
// InodeDirectoryNoNewChildren partially implements the Inode interface.
@@ -243,7 +243,7 @@ func (a *InodeAttrs) Mode() linux.FileMode {
// Stat partially implements Inode.Stat. Note that this function doesn't provide
// all the stat fields, and the embedder should consider extending the result
// with filesystem-specific fields.
-func (a *InodeAttrs) Stat(*vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) {
+func (a *InodeAttrs) Stat(context.Context, *vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) {
var stat linux.Statx
stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK
stat.DevMajor = a.devMajor
@@ -267,7 +267,7 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut
if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 {
return syserror.EPERM
}
- if err := vfs.CheckSetStat(ctx, creds, &opts.Stat, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
+ if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
return err
}
@@ -293,6 +293,8 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut
// inode numbers are immutable after node creation.
// TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps.
+ // Also, STATX_SIZE will need some special handling, because read-only static
+ // files should return EIO for truncate operations.
return nil
}
@@ -364,12 +366,12 @@ func (o *OrderedChildren) Init(opts OrderedChildrenOptions) {
}
// DecRef implements Inode.DecRef.
-func (o *OrderedChildren) DecRef() {
- o.AtomicRefCount.DecRefWithDestructor(o.Destroy)
+func (o *OrderedChildren) DecRef(ctx context.Context) {
+ o.AtomicRefCount.DecRefWithDestructor(ctx, o.Destroy)
}
// Destroy cleans up resources referenced by this OrderedChildren.
-func (o *OrderedChildren) Destroy() {
+func (o *OrderedChildren) Destroy(context.Context) {
o.mu.Lock()
defer o.mu.Unlock()
o.order.Reset()
@@ -469,6 +471,8 @@ func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.De
if err := o.checkExistingLocked(name, child); err != nil {
return err
}
+
+ // TODO(gvisor.dev/issue/3027): Check sticky bit before removing.
o.removeLocked(name)
return nil
}
@@ -516,6 +520,8 @@ func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, c
if err := o.checkExistingLocked(oldname, child); err != nil {
return nil, err
}
+
+ // TODO(gvisor.dev/issue/3027): Check sticky bit before removing.
replaced := dst.replaceChildLocked(newname, child)
return replaced, nil
}
@@ -555,6 +561,8 @@ type StaticDirectory struct {
InodeAttrs
InodeNoDynamicLookup
OrderedChildren
+
+ locks vfs.FileLocks
}
var _ Inode = (*StaticDirectory)(nil)
@@ -584,7 +592,7 @@ func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint3
// Open implements kernfs.Inode.
func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &opts)
+ fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &s.locks, &opts)
if err != nil {
return nil, err
}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index a83151ad3..080118841 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -116,17 +116,17 @@ func (fs *Filesystem) deferDecRef(d *vfs.Dentry) {
// processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the
// droppedDentries list. See comment on Filesystem.mu.
-func (fs *Filesystem) processDeferredDecRefs() {
+func (fs *Filesystem) processDeferredDecRefs(ctx context.Context) {
fs.mu.Lock()
- fs.processDeferredDecRefsLocked()
+ fs.processDeferredDecRefsLocked(ctx)
fs.mu.Unlock()
}
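// Why deferral is needed, in sketch form (see the comment on Filesystem.mu
// for the authoritative rationale): dentries can become droppable during a
// walk while fs.mu is held, but DecRef may destroy them, which itself needs
// fs.mu. Callers therefore queue drops and flush them once the lock state
// allows, e.g. (hypothetical caller, mirroring the pattern in filesystem.go):
//
//	fs.mu.RLock()
//	defer fs.processDeferredDecRefs(ctx) // LIFO: runs after RUnlock below
//	defer fs.mu.RUnlock()
//	// ... walk may call fs.deferDecRef(d) on dentries it drops ...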
// Precondition: fs.mu must be held for writing.
-func (fs *Filesystem) processDeferredDecRefsLocked() {
+func (fs *Filesystem) processDeferredDecRefsLocked(ctx context.Context) {
fs.droppedDentriesMu.Lock()
for _, d := range fs.droppedDentries {
- d.DecRef()
+ d.DecRef(ctx)
}
fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse.
fs.droppedDentriesMu.Unlock()
@@ -212,22 +212,37 @@ func (d *Dentry) isSymlink() bool {
}
// DecRef implements vfs.DentryImpl.DecRef.
-func (d *Dentry) DecRef() {
- d.AtomicRefCount.DecRefWithDestructor(d.destroy)
+func (d *Dentry) DecRef(ctx context.Context) {
+ d.AtomicRefCount.DecRefWithDestructor(ctx, d.destroy)
}
// Precondition: Dentry must be removed from VFS' dentry cache.
-func (d *Dentry) destroy() {
- d.inode.DecRef() // IncRef from Init.
+func (d *Dentry) destroy(ctx context.Context) {
+ d.inode.DecRef(ctx) // IncRef from Init.
d.inode = nil
if d.parent != nil {
- d.parent.DecRef() // IncRef from Dentry.InsertChild.
+ d.parent.DecRef(ctx) // IncRef from Dentry.InsertChild.
}
}
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+//
+// Although Linux technically supports inotify on pseudo filesystems (inotify
+// is implemented at the vfs layer), it is not particularly useful. It is left
+// unimplemented until someone actually needs it.
+func (d *Dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {}
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *Dentry) Watches() *vfs.Watches {
+ return nil
+}
+
+// OnZeroWatches implements vfs.Dentry.OnZeroWatches.
+func (d *Dentry) OnZeroWatches(context.Context) {}
+
// InsertChild inserts child into the vfs dentry cache with the given name under
// this dentry. This does not update the directory inode, so calling this on
-// it's own isn't sufficient to insert a child into a directory. InsertChild
+// its own isn't sufficient to insert a child into a directory. InsertChild
// updates the link count on d if required.
//
// Precondition: d must represent a directory inode.
@@ -311,12 +326,12 @@ type Inode interface {
type inodeRefs interface {
IncRef()
- DecRef()
+ DecRef(ctx context.Context)
TryIncRef() bool
// Destroy is called when the inode reaches zero references. Destroy releases
// all resources (references) on objects referenced by the inode, including
// any child dentries.
- Destroy()
+ Destroy(ctx context.Context)
}
type inodeMetadata interface {
@@ -331,7 +346,7 @@ type inodeMetadata interface {
// Stat returns the metadata for this inode. This corresponds to
// vfs.FilesystemImpl.StatAt.
- Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error)
+ Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error)
// SetStat updates the metadata for this inode. This corresponds to
// vfs.FilesystemImpl.SetStatAt. Implementations are responsible for checking
@@ -413,10 +428,10 @@ type inodeDynamicLookup interface {
// IterDirents is used to iterate over dynamically created entries. It invokes
// cb on each entry in the directory represented by the FileDescription.
// 'offset' is the offset for the entire IterDirents call, which may include
- // results from the caller. 'relOffset' is the offset inside the entries
- // returned by this IterDirents invocation. In other words,
- // 'offset+relOffset+1' is the value that should be set in vfs.Dirent.NextOff,
- // while 'relOffset' is the place where iteration should start from.
+ // results from the caller (e.g. "." and ".."). 'relOffset' is the offset
+ // inside the entries returned by this IterDirents invocation. In other words,
+ // 'offset' should be used to calculate each vfs.Dirent.NextOff as well as
+ // the return value, while 'relOffset' is the place to start iteration.
IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error)
}
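To make the offset/relOffset contract concrete, here is a minimal sketch of an
implementer (hypothetical inode type with static entries; real implementations
generate entries dynamically, and bounds checks on relOffset are omitted):

	func (i *inode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
		entries := []vfs.Dirent{{Name: "a", Type: linux.DT_REG}, {Name: "b", Type: linux.DT_REG}}
		for _, e := range entries[relOffset:] {
			e.NextOff = offset + 1 // absolute offset of the entry after this one
			if err := cb.Handle(e); err != nil {
				return offset, err
			}
			offset++
		}
		return offset, nil
	}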
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index 412cf6ac9..c5d5afedf 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -46,7 +46,7 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System {
ctx := contexttest.Context(t)
creds := auth.CredentialsFromContext(ctx)
v := &vfs.VirtualFilesystem{}
- if err := v.Init(); err != nil {
+ if err := v.Init(ctx); err != nil {
t.Fatalf("VFS init: %v", err)
}
v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{
@@ -100,8 +100,10 @@ type readonlyDir struct {
kernfs.InodeNotSymlink
kernfs.InodeNoDynamicLookup
kernfs.InodeDirectoryNoNewChildren
-
kernfs.OrderedChildren
+
+ locks vfs.FileLocks
+
dentry kernfs.Dentry
}
@@ -117,7 +119,7 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod
}
func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
if err != nil {
return nil, err
}
@@ -128,10 +130,12 @@ type dir struct {
attrs
kernfs.InodeNotSymlink
kernfs.InodeNoDynamicLookup
+ kernfs.OrderedChildren
+
+ locks vfs.FileLocks
fs *filesystem
dentry kernfs.Dentry
- kernfs.OrderedChildren
}
func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
@@ -147,7 +151,7 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte
}
func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
if err != nil {
return nil, err
}
@@ -159,7 +163,7 @@ func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*
dir := d.fs.newDir(creds, opts.Mode, nil)
dirVFSD := dir.VFSDentry()
if err := d.OrderedChildren.Insert(name, dirVFSD); err != nil {
- dir.DecRef()
+ dir.DecRef(ctx)
return nil, err
}
d.IncLinks(1)
@@ -171,7 +175,7 @@ func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*
f := d.fs.newFile(creds, "")
fVFSD := f.VFSDentry()
if err := d.OrderedChildren.Insert(name, fVFSD); err != nil {
- f.DecRef()
+ f.DecRef(ctx)
return nil, err
}
return fVFSD, nil
@@ -209,7 +213,7 @@ func TestBasic(t *testing.T) {
})
})
defer sys.Destroy()
- sys.GetDentryOrDie(sys.PathOpAtRoot("file1")).DecRef()
+ sys.GetDentryOrDie(sys.PathOpAtRoot("file1")).DecRef(sys.Ctx)
}
func TestMkdirGetDentry(t *testing.T) {
@@ -224,7 +228,7 @@ func TestMkdirGetDentry(t *testing.T) {
if err := sys.VFS.MkdirAt(sys.Ctx, sys.Creds, pop, &vfs.MkdirOptions{Mode: 0755}); err != nil {
t.Fatalf("MkdirAt for PathOperation %+v failed: %v", pop, err)
}
- sys.GetDentryOrDie(pop).DecRef()
+ sys.GetDentryOrDie(pop).DecRef(sys.Ctx)
}
func TestReadStaticFile(t *testing.T) {
@@ -242,7 +246,7 @@ func TestReadStaticFile(t *testing.T) {
if err != nil {
t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
}
- defer fd.DecRef()
+ defer fd.DecRef(sys.Ctx)
content, err := sys.ReadToEnd(fd)
if err != nil {
@@ -269,7 +273,7 @@ func TestCreateNewFileInStaticDir(t *testing.T) {
}
// Close the file. The file should persist.
- fd.DecRef()
+ fd.DecRef(sys.Ctx)
fd, err = sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, &vfs.OpenOptions{
Flags: linux.O_RDONLY,
@@ -277,7 +281,7 @@ func TestCreateNewFileInStaticDir(t *testing.T) {
if err != nil {
t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err)
}
- fd.DecRef()
+ fd.DecRef(sys.Ctx)
}
func TestDirFDReadWrite(t *testing.T) {
@@ -293,7 +297,7 @@ func TestDirFDReadWrite(t *testing.T) {
if err != nil {
t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
}
- defer fd.DecRef()
+ defer fd.DecRef(sys.Ctx)
// Read/Write should fail for directory FDs.
if _, err := fd.Read(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); err != syserror.EISDIR {
diff --git a/pkg/sentry/fsimpl/overlay/BUILD b/pkg/sentry/fsimpl/overlay/BUILD
new file mode 100644
index 000000000..8cf5b35d3
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/BUILD
@@ -0,0 +1,41 @@
+load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+licenses(["notice"])
+
+go_template_instance(
+ name = "fstree",
+ out = "fstree.go",
+ package = "overlay",
+ prefix = "generic",
+ template = "//pkg/sentry/vfs/genericfstree:generic_fstree",
+ types = {
+ "Dentry": "dentry",
+ },
+)
+
+go_library(
+ name = "overlay",
+ srcs = [
+ "copy_up.go",
+ "directory.go",
+ "filesystem.go",
+ "fstree.go",
+ "non_directory.go",
+ "overlay.go",
+ ],
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/context",
+ "//pkg/fspath",
+ "//pkg/sentry/fs/lock",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/memmap",
+ "//pkg/sentry/socket/unix/transport",
+ "//pkg/sentry/vfs",
+ "//pkg/sync",
+ "//pkg/syserror",
+ "//pkg/usermem",
+ ],
+)
diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go
new file mode 100644
index 000000000..b3d19ff82
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/copy_up.go
@@ -0,0 +1,262 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+ "fmt"
+ "io"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+func (d *dentry) isCopiedUp() bool {
+ return atomic.LoadUint32(&d.copiedUp) != 0
+}
+
+// copyUpLocked ensures that d exists on the upper layer, i.e. d.upperVD.Ok().
+//
+// Preconditions: filesystem.renameMu must be locked.
+func (d *dentry) copyUpLocked(ctx context.Context) error {
+ // Fast path.
+ if d.isCopiedUp() {
+ return nil
+ }
+
+ ftype := atomic.LoadUint32(&d.mode) & linux.S_IFMT
+ switch ftype {
+ case linux.S_IFREG, linux.S_IFDIR, linux.S_IFLNK, linux.S_IFBLK, linux.S_IFCHR:
+ // Can be copied-up.
+ default:
+ // Can't be copied-up.
+ return syserror.EPERM
+ }
+
+ // Ensure that our parent directory is copied-up.
+ if d.parent == nil {
+ // d is a filesystem root with no upper layer.
+ return syserror.EROFS
+ }
+ if err := d.parent.copyUpLocked(ctx); err != nil {
+ return err
+ }
+
+ d.copyMu.Lock()
+ defer d.copyMu.Unlock()
+ if d.upperVD.Ok() {
+ // Raced with another call to d.copyUpLocked().
+ return nil
+ }
+ if d.vfsd.IsDead() {
+ // Raced with deletion of d.
+ return syserror.ENOENT
+ }
+
+ // Perform copy-up.
+ vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ newpop := vfs.PathOperation{
+ Root: d.parent.upperVD,
+ Start: d.parent.upperVD,
+ Path: fspath.Parse(d.name),
+ }
+ cleanupUndoCopyUp := func() {
+ var err error
+ if ftype == linux.S_IFDIR {
+ err = vfsObj.RmdirAt(ctx, d.fs.creds, &newpop)
+ } else {
+ err = vfsObj.UnlinkAt(ctx, d.fs.creds, &newpop)
+ }
+ if err != nil {
+ ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err)
+ }
+ }
+ switch ftype {
+ case linux.S_IFREG:
+ oldFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.lowerVDs[0],
+ Start: d.lowerVDs[0],
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ })
+ if err != nil {
+ return err
+ }
+ defer oldFD.DecRef(ctx)
+ newFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &newpop, &vfs.OpenOptions{
+ Flags: linux.O_WRONLY | linux.O_CREAT | linux.O_EXCL,
+ Mode: linux.FileMode(d.mode &^ linux.S_IFMT),
+ })
+ if err != nil {
+ return err
+ }
+ defer newFD.DecRef(ctx)
+ bufIOSeq := usermem.BytesIOSequence(make([]byte, 32*1024)) // arbitrary buffer size
+ for {
+ readN, readErr := oldFD.Read(ctx, bufIOSeq, vfs.ReadOptions{})
+ if readErr != nil && readErr != io.EOF {
+ cleanupUndoCopyUp()
+ return readErr
+ }
+ total := int64(0)
+ for total < readN {
+ writeN, writeErr := newFD.Write(ctx, bufIOSeq.DropFirst64(total), vfs.WriteOptions{})
+ total += writeN
+ if writeErr != nil {
+ cleanupUndoCopyUp()
+ return writeErr
+ }
+ }
+ if readErr == io.EOF {
+ break
+ }
+ }
+ if err := newFD.SetStat(ctx, vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: linux.STATX_UID | linux.STATX_GID,
+ UID: d.uid,
+ GID: d.gid,
+ },
+ }); err != nil {
+ cleanupUndoCopyUp()
+ return err
+ }
+ d.upperVD = newFD.VirtualDentry()
+ d.upperVD.IncRef()
+
+ case linux.S_IFDIR:
+ if err := vfsObj.MkdirAt(ctx, d.fs.creds, &newpop, &vfs.MkdirOptions{
+ Mode: linux.FileMode(d.mode &^ linux.S_IFMT),
+ }); err != nil {
+ return err
+ }
+ if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: linux.STATX_UID | linux.STATX_GID,
+ UID: d.uid,
+ GID: d.gid,
+ },
+ }); err != nil {
+ cleanupUndoCopyUp()
+ return err
+ }
+ upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{})
+ if err != nil {
+ cleanupUndoCopyUp()
+ return err
+ }
+ d.upperVD = upperVD
+
+ case linux.S_IFLNK:
+ target, err := vfsObj.ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.lowerVDs[0],
+ Start: d.lowerVDs[0],
+ })
+ if err != nil {
+ return err
+ }
+ if err := vfsObj.SymlinkAt(ctx, d.fs.creds, &newpop, target); err != nil {
+ return err
+ }
+ if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID,
+ Mode: uint16(d.mode),
+ UID: d.uid,
+ GID: d.gid,
+ },
+ }); err != nil {
+ cleanupUndoCopyUp()
+ return err
+ }
+ upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{})
+ if err != nil {
+ cleanupUndoCopyUp()
+ return err
+ }
+ d.upperVD = upperVD
+
+ case linux.S_IFBLK, linux.S_IFCHR:
+ lowerStat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.lowerVDs[0],
+ Start: d.lowerVDs[0],
+ }, &vfs.StatOptions{})
+ if err != nil {
+ return err
+ }
+ if err := vfsObj.MknodAt(ctx, d.fs.creds, &newpop, &vfs.MknodOptions{
+ Mode: linux.FileMode(d.mode),
+ DevMajor: lowerStat.RdevMajor,
+ DevMinor: lowerStat.RdevMinor,
+ }); err != nil {
+ return err
+ }
+ if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: linux.STATX_UID | linux.STATX_GID,
+ UID: d.uid,
+ GID: d.gid,
+ },
+ }); err != nil {
+ cleanupUndoCopyUp()
+ return err
+ }
+ upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{})
+ if err != nil {
+ cleanupUndoCopyUp()
+ return err
+ }
+ d.upperVD = upperVD
+
+ default:
+ // This should have been rejected by the file type check at the beginning
+ // of this function.
+ panic(fmt.Sprintf("unexpected file type %o", ftype))
+ }
+
+ // TODO(gvisor.dev/issue/1199): copy up xattrs
+
+ // Update the dentry's device and inode numbers (except for directories,
+ // for which these remain overlay-assigned).
+ if ftype != linux.S_IFDIR {
+ upperStat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.upperVD,
+ Start: d.upperVD,
+ }, &vfs.StatOptions{
+ Mask: linux.STATX_INO,
+ })
+ if err != nil {
+ d.upperVD.DecRef(ctx)
+ d.upperVD = vfs.VirtualDentry{}
+ cleanupUndoCopyUp()
+ return err
+ }
+ if upperStat.Mask&linux.STATX_INO == 0 {
+ d.upperVD.DecRef(ctx)
+ d.upperVD = vfs.VirtualDentry{}
+ cleanupUndoCopyUp()
+ return syserror.EREMOTE
+ }
+ atomic.StoreUint32(&d.devMajor, upperStat.DevMajor)
+ atomic.StoreUint32(&d.devMinor, upperStat.DevMinor)
+ atomic.StoreUint64(&d.ino, upperStat.Ino)
+ }
+
+ atomic.StoreUint32(&d.copiedUp, 1)
+ return nil
+}
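The regular-file case above copies file data through a fixed 32 KiB buffer,
with an inner loop because vfs.FileDescription.Write may return short write
counts. The same shape over plain io interfaces, as a standalone sketch
(illustration only, not part of this file; assumes "io" is imported):

	// copyAll copies src to dst until EOF, retrying short writes.
	func copyAll(dst io.Writer, src io.Reader) error {
		buf := make([]byte, 32*1024)
		for {
			n, readErr := src.Read(buf)
			for off := 0; off < n; {
				wn, err := dst.Write(buf[off:n])
				off += wn
				if err != nil {
					return err
				}
			}
			if readErr == io.EOF {
				return nil
			}
			if readErr != nil {
				return readErr
			}
		}
	}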
diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go
new file mode 100644
index 000000000..6a79f7ffe
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/directory.go
@@ -0,0 +1,289 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+func (d *dentry) isDir() bool {
+ return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR
+}
+
+// Preconditions: d.dirMu must be locked. d.isDir().
+func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string]bool, error) {
+ vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ var readdirErr error
+ whiteouts := make(map[string]bool)
+ var maybeWhiteouts []string
+ d.iterLayers(func(layerVD vfs.VirtualDentry, isUpper bool) bool {
+ layerFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: layerVD,
+ Start: layerVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY | linux.O_DIRECTORY,
+ })
+ if err != nil {
+ readdirErr = err
+ return false
+ }
+ defer layerFD.DecRef(ctx)
+
+ // Reuse slice allocated for maybeWhiteouts from a previous layer to
+ // reduce allocations.
+ maybeWhiteouts = maybeWhiteouts[:0]
+ err = layerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
+ if dirent.Name == "." || dirent.Name == ".." {
+ return nil
+ }
+ if _, ok := whiteouts[dirent.Name]; ok {
+ // This file has been whited-out in a previous layer.
+ return nil
+ }
+ if dirent.Type == linux.DT_CHR {
+ // We have to determine if this is a whiteout, which doesn't
+ // count against the directory's emptiness. However, we can't
+ // do so while holding locks held by layerFD.IterDirents().
+ maybeWhiteouts = append(maybeWhiteouts, dirent.Name)
+ return nil
+ }
+ // Non-whiteout file in the directory prevents rmdir.
+ return syserror.ENOTEMPTY
+ }))
+ if err != nil {
+ readdirErr = err
+ return false
+ }
+
+ for _, maybeWhiteoutName := range maybeWhiteouts {
+ stat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: layerVD,
+ Start: layerVD,
+ Path: fspath.Parse(maybeWhiteoutName),
+ }, &vfs.StatOptions{})
+ if err != nil {
+ readdirErr = err
+ return false
+ }
+ if stat.RdevMajor != 0 || stat.RdevMinor != 0 {
+ // This file is a real character device, not a whiteout.
+ readdirErr = syserror.ENOTEMPTY
+ return false
+ }
+ whiteouts[maybeWhiteoutName] = isUpper
+ }
+ // Continue iteration since we haven't found any non-whiteout files in
+ // this directory yet.
+ return true
+ })
+ return whiteouts, readdirErr
+}
+
+type directoryFD struct {
+ fileDescription
+ vfs.DirectoryFileDescriptionDefaultImpl
+ vfs.DentryMetadataFileDescriptionImpl
+
+ mu sync.Mutex
+ off int64
+ dirents []vfs.Dirent
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *directoryFD) Release(ctx context.Context) {
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+
+ d := fd.dentry()
+ if fd.dirents == nil {
+ ds, err := d.getDirents(ctx)
+ if err != nil {
+ return err
+ }
+ fd.dirents = ds
+ }
+
+ for fd.off < int64(len(fd.dirents)) {
+ if err := cb.Handle(fd.dirents[fd.off]); err != nil {
+ return err
+ }
+ fd.off++
+ }
+ return nil
+}
+
+// Preconditions: d.isDir().
+func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
+ d.fs.renameMu.RLock()
+ defer d.fs.renameMu.RUnlock()
+ d.dirMu.Lock()
+ defer d.dirMu.Unlock()
+
+ if d.dirents != nil {
+ return d.dirents, nil
+ }
+
+ parent := genericParentOrSelf(d)
+ dirents := []vfs.Dirent{
+ {
+ Name: ".",
+ Type: linux.DT_DIR,
+ Ino: d.ino,
+ NextOff: 1,
+ },
+ {
+ Name: "..",
+ Type: uint8(atomic.LoadUint32(&parent.mode) >> 12),
+ Ino: parent.ino,
+ NextOff: 2,
+ },
+ }
+
+ // Merge dirents from all layers comprising this directory.
+ vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ var readdirErr error
+ prevDirents := make(map[string]struct{})
+ var maybeWhiteouts []vfs.Dirent
+ d.iterLayers(func(layerVD vfs.VirtualDentry, isUpper bool) bool {
+ layerFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: layerVD,
+ Start: layerVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY | linux.O_DIRECTORY,
+ })
+ if err != nil {
+ readdirErr = err
+ return false
+ }
+ defer layerFD.DecRef(ctx)
+
+ // Reuse slice allocated for maybeWhiteouts from a previous layer to
+ // reduce allocations.
+ maybeWhiteouts = maybeWhiteouts[:0]
+ err = layerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
+ if dirent.Name == "." || dirent.Name == ".." {
+ return nil
+ }
+ if _, ok := prevDirents[dirent.Name]; ok {
+ // This file is hidden by, or merged with, another file with
+ // the same name in a previous layer.
+ return nil
+ }
+ prevDirents[dirent.Name] = struct{}{}
+ if dirent.Type == linux.DT_CHR {
+ // We can't determine if this file is a whiteout while holding
+ // locks held by layerFD.IterDirents().
+ maybeWhiteouts = append(maybeWhiteouts, dirent)
+ return nil
+ }
+ dirent.NextOff = int64(len(dirents) + 1)
+ dirents = append(dirents, dirent)
+ return nil
+ }))
+ if err != nil {
+ readdirErr = err
+ return false
+ }
+
+ for _, dirent := range maybeWhiteouts {
+ stat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: layerVD,
+ Start: layerVD,
+ Path: fspath.Parse(dirent.Name),
+ }, &vfs.StatOptions{})
+ if err != nil {
+ readdirErr = err
+ return false
+ }
+ if stat.RdevMajor == 0 && stat.RdevMinor == 0 {
+ // This file is a whiteout; don't emit a dirent for it.
+ continue
+ }
+ dirent.NextOff = int64(len(dirents) + 1)
+ dirents = append(dirents, dirent)
+ }
+ return true
+ })
+ if readdirErr != nil {
+ return nil, readdirErr
+ }
+
+ // Cache dirents for future directoryFDs.
+ d.dirents = dirents
+ return dirents, nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+
+ switch whence {
+ case linux.SEEK_SET:
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+ if offset == 0 {
+ // Ensure that the next call to fd.IterDirents() calls
+ // fd.dentry().getDirents().
+ fd.dirents = nil
+ }
+ fd.off = offset
+ return fd.off, nil
+ case linux.SEEK_CUR:
+ offset += fd.off
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+ // Don't clear fd.dirents in this case, even if offset == 0.
+ fd.off = offset
+ return fd.off, nil
+ default:
+ return 0, syserror.EINVAL
+ }
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync. Forwards sync to the upper
+// layer, if there is one. The lower layer doesn't need to sync because it
+// never changes.
+func (fd *directoryFD) Sync(ctx context.Context) error {
+ d := fd.dentry()
+ if !d.isCopiedUp() {
+ return nil
+ }
+ vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ pop := vfs.PathOperation{
+ Root: d.upperVD,
+ Start: d.upperVD,
+ }
+ upperFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY})
+ if err != nil {
+ return err
+ }
+ err = upperFD.Sync(ctx)
+ upperFD.DecRef(ctx)
+ return err
+}
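Both collectWhiteoutsForRmdirLocked and getDirents above share a two-pass
shape: DT_CHR entries seen during IterDirents are only recorded, because
StatAt can't be called while IterDirents holds its locks, and are stat()ed in
a second pass to distinguish whiteouts (rdev 0:0) from real character
devices. Condensed from the code above:

	// Pass 1, inside the IterDirents callback: record candidates only.
	if dirent.Type == linux.DT_CHR {
		maybeWhiteouts = append(maybeWhiteouts, dirent)
		return nil
	}
	// Pass 2, after IterDirents returns: stat each candidate; rdev 0:0
	// means whiteout, anything else is a real device.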
diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go
new file mode 100644
index 000000000..986b36ead
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/filesystem.go
@@ -0,0 +1,1364 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// _OVL_XATTR_OPAQUE is an extended attribute key whose value is set to "y" for
+// opaque directories.
+// Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_OPAQUE
+const _OVL_XATTR_OPAQUE = "trusted.overlay.opaque"
+
+func isWhiteout(stat *linux.Statx) bool {
+ return stat.Mode&linux.S_IFMT == linux.S_IFCHR && stat.RdevMajor == 0 && stat.RdevMinor == 0
+}
+
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *filesystem) Sync(ctx context.Context) error {
+ if fs.opts.UpperRoot.Ok() {
+ return fs.opts.UpperRoot.Mount().Filesystem().Impl().Sync(ctx)
+ }
+ return nil
+}
+
+var dentrySlicePool = sync.Pool{
+ New: func() interface{} {
+ ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity
+ return &ds
+ },
+}
+
+func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry {
+ if ds == nil {
+ ds = dentrySlicePool.Get().(*[]*dentry)
+ }
+ *ds = append(*ds, d)
+ return ds
+}
+
+// Preconditions: ds != nil.
+func putDentrySlice(ds *[]*dentry) {
+ // Allow dentries to be GC'd.
+ for i := range *ds {
+ (*ds)[i] = nil
+ }
+ *ds = (*ds)[:0]
+ dentrySlicePool.Put(ds)
+}
+
+// renameMuRUnlockAndCheckDrop calls fs.renameMu.RUnlock(), then calls
+// dentry.checkDropLocked on all dentries in *ds with fs.renameMu locked for
+// writing.
+//
+// ds is a pointer-to-pointer since defer evaluates its arguments immediately,
+// but dentry slices are allocated lazily, and it's much easier to say "defer
+// fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() {
+// fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this.
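+//
+// Illustration of the defer subtlety (plain Go semantics, not specific to
+// this file): arguments to a deferred call are evaluated at the defer
+// statement, so a nil *ds captured then stays nil, while &ds is a stable
+// address through which a later allocation is visible:
+//
+//	var ds *[]*dentry // nil now
+//	defer f(ds)       // f always sees nil
+//	defer g(&ds)      // g sees whatever ds holds when the defer runs
+//	ds = dentrySlicePool.Get().(*[]*dentry)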
+func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) {
+ fs.renameMu.RUnlock()
+ if *ds == nil {
+ return
+ }
+ if len(**ds) != 0 {
+ fs.renameMu.Lock()
+ for _, d := range **ds {
+ d.checkDropLocked(ctx)
+ }
+ fs.renameMu.Unlock()
+ }
+ putDentrySlice(*ds)
+}
+
+func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) {
+ if *ds == nil {
+ fs.renameMu.Unlock()
+ return
+ }
+ for _, d := range **ds {
+ d.checkDropLocked(ctx)
+ }
+ fs.renameMu.Unlock()
+ putDentrySlice(*ds)
+}
+
+// stepLocked resolves rp.Component() to an existing file, starting from the
+// given directory.
+//
+// Dentries which may have a reference count of zero, and which therefore
+// should be dropped once traversal is complete, are appended to ds.
+//
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// !rp.Done().
+func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
+ if !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, err
+ }
+afterSymlink:
+ name := rp.Component()
+ if name == "." {
+ rp.Advance()
+ return d, nil
+ }
+ if name == ".." {
+ if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
+ return nil, err
+ } else if isRoot || d.parent == nil {
+ rp.Advance()
+ return d, nil
+ }
+ if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
+ return nil, err
+ }
+ rp.Advance()
+ return d.parent, nil
+ }
+ child, err := fs.getChildLocked(ctx, d, name, ds)
+ if err != nil {
+ return nil, err
+ }
+ if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
+ return nil, err
+ }
+ if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() {
+ target, err := child.readlink(ctx)
+ if err != nil {
+ return nil, err
+ }
+ if err := rp.HandleSymlink(target); err != nil {
+ return nil, err
+ }
+ goto afterSymlink // don't check the current directory again
+ }
+ rp.Advance()
+ return child, nil
+}
+
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
+ if child, ok := parent.children[name]; ok {
+ return child, nil
+ }
+ child, err := fs.lookupLocked(ctx, parent, name)
+ if err != nil {
+ return nil, err
+ }
+ if parent.children == nil {
+ parent.children = make(map[string]*dentry)
+ }
+ parent.children[name] = child
+ // child's refcount is initially 0, so it may be dropped after traversal.
+ *ds = appendDentry(*ds, child)
+ return child, nil
+}
+
+// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) {
+ childPath := fspath.Parse(name)
+ child := fs.newDentry()
+ existsOnAnyLayer := false
+ var lookupErr error
+
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+ parent.iterLayers(func(parentVD vfs.VirtualDentry, isUpper bool) bool {
+ childVD, err := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: parentVD,
+ Start: parentVD,
+ Path: childPath,
+ }, &vfs.GetDentryOptions{})
+ if err == syserror.ENOENT || err == syserror.ENAMETOOLONG {
+ // The file doesn't exist on this layer. Proceed to the next one.
+ return true
+ }
+ if err != nil {
+ lookupErr = err
+ return false
+ }
+
+ mask := uint32(linux.STATX_TYPE)
+ if !existsOnAnyLayer {
+ // Mode, UID, GID, and (for non-directories) inode number come from
+ // the topmost layer on which the file exists.
+ mask |= linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
+ }
+ stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: childVD,
+ Start: childVD,
+ }, &vfs.StatOptions{
+ Mask: mask,
+ })
+ if err != nil {
+ lookupErr = err
+ return false
+ }
+ if stat.Mask&mask != mask {
+ lookupErr = syserror.EREMOTE
+ return false
+ }
+
+ if isWhiteout(&stat) {
+ // This is a whiteout, so it "doesn't exist" on this layer, and
+ // layers below this one are ignored.
+ return false
+ }
+ isDir := stat.Mode&linux.S_IFMT == linux.S_IFDIR
+ if existsOnAnyLayer && !isDir {
+ // Directories are not merged with non-directory files from lower
+ // layers; instead, layers including and below the first
+ // non-directory file are ignored. (The file with this name must have been
+ // a directory on the layers already visited, since iteration stops at the
+ // first non-directory file.)
+ return false
+ }
+
+ // Update child to include this layer.
+ if isUpper {
+ child.upperVD = childVD
+ child.copiedUp = 1
+ } else {
+ child.lowerVDs = append(child.lowerVDs, childVD)
+ }
+ if !existsOnAnyLayer {
+ existsOnAnyLayer = true
+ child.mode = uint32(stat.Mode)
+ child.uid = stat.UID
+ child.gid = stat.GID
+ child.devMajor = stat.DevMajor
+ child.devMinor = stat.DevMinor
+ child.ino = stat.Ino
+ }
+
+ // For non-directory files, only the topmost layer that contains a file
+ // matters.
+ if !isDir {
+ return false
+ }
+
+ // Directories are merged with directories from lower layers if they
+ // are not explicitly opaque.
+ opaqueVal, err := vfsObj.GetxattrAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: childVD,
+ Start: childVD,
+ }, &vfs.GetxattrOptions{
+ Name: _OVL_XATTR_OPAQUE,
+ Size: 1,
+ })
+ return !(err == nil && opaqueVal == "y")
+ })
+
+ if lookupErr != nil {
+ child.destroyLocked(ctx)
+ return nil, lookupErr
+ }
+ if !existsOnAnyLayer {
+ child.destroyLocked(ctx)
+ return nil, syserror.ENOENT
+ }
+
+ // Device and inode numbers were copied from the topmost layer above;
+ // override them if necessary.
+ if child.isDir() {
+ child.devMajor = linux.UNNAMED_MAJOR
+ child.devMinor = fs.dirDevMinor
+ child.ino = fs.newDirIno()
+ } else if !child.upperVD.Ok() {
+ child.devMajor = linux.UNNAMED_MAJOR
+ child.devMinor = fs.lowerDevMinors[child.lowerVDs[0].Mount().Filesystem()]
+ }
+
+ parent.IncRef()
+ child.parent = parent
+ child.name = name
+ return child, nil
+}
+
+// lookupLayerLocked is similar to lookupLocked, but only returns information
+// about the file rather than a dentry.
+//
+// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+func (fs *filesystem) lookupLayerLocked(ctx context.Context, parent *dentry, name string) (lookupLayer, error) {
+ childPath := fspath.Parse(name)
+ lookupLayer := lookupLayerNone
+ var lookupErr error
+
+ parent.iterLayers(func(parentVD vfs.VirtualDentry, isUpper bool) bool {
+ stat, err := fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: parentVD,
+ Start: parentVD,
+ Path: childPath,
+ }, &vfs.StatOptions{
+ Mask: linux.STATX_TYPE,
+ })
+ if err == syserror.ENOENT || err == syserror.ENAMETOOLONG {
+ // The file doesn't exist on this layer. Proceed to the next
+ // one.
+ return true
+ }
+ if err != nil {
+ lookupErr = err
+ return false
+ }
+ if stat.Mask&linux.STATX_TYPE == 0 {
+ // Linux's overlayfs tends to return EREMOTE in cases where a file
+ // is unusable for reasons that are not better captured by another
+ // errno.
+ lookupErr = syserror.EREMOTE
+ return false
+ }
+ if isWhiteout(&stat) {
+ // This is a whiteout, so it "doesn't exist" on this layer, and
+ // layers below this one are ignored.
+ if isUpper {
+ lookupLayer = lookupLayerUpperWhiteout
+ }
+ return false
+ }
+ // The file exists; we can stop searching.
+ if isUpper {
+ lookupLayer = lookupLayerUpper
+ } else {
+ lookupLayer = lookupLayerLower
+ }
+ return false
+ })
+
+ return lookupLayer, lookupErr
+}
+
+type lookupLayer int
+
+const (
+ // lookupLayerNone indicates that no file exists at the given path on the
+ // upper layer, and that the path is either whited out or does not exist
+ // on lower layers.
+ // Therefore, the file does not exist in the overlay filesystem, and file
+ // creation may proceed normally (if an upper layer exists).
+ lookupLayerNone lookupLayer = iota
+
+ // lookupLayerLower indicates that no file exists at the given path on the
+ // upper layer, but exists on a lower layer. Therefore, the file exists in
+ // the overlay filesystem, but must be copied-up before mutation.
+ lookupLayerLower
+
+ // lookupLayerUpper indicates that a non-whiteout file exists at the given
+ // path on the upper layer. Therefore, the file exists in the overlay
+ // filesystem, and is already copied-up.
+ lookupLayerUpper
+
+ // lookupLayerUpperWhiteout indicates that a whiteout exists at the given
+ // path on the upper layer. Therefore, the file does not exist in the
+ // overlay filesystem, and file creation must remove the whiteout before
+ // proceeding.
+ lookupLayerUpperWhiteout
+)
+
+func (ll lookupLayer) existsInOverlay() bool {
+ return ll == lookupLayerLower || ll == lookupLayerUpper
+}
+
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory, starting from the given directory (which is usually
+// rp.Start().Impl().(*dentry)). It does not check that the returned directory
+// is searchable by the provider of rp.
+//
+// Preconditions: fs.renameMu must be locked. !rp.Done().
+func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
+ for !rp.Final() {
+ d.dirMu.Lock()
+ next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+ d.dirMu.Unlock()
+ if err != nil {
+ return nil, err
+ }
+ d = next
+ }
+ if !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ return d, nil
+}
+
+// resolveLocked resolves rp to an existing file.
+//
+// Preconditions: fs.renameMu must be locked.
+func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
+ d := rp.Start().Impl().(*dentry)
+ for !rp.Done() {
+ d.dirMu.Lock()
+ next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+ d.dirMu.Unlock()
+ if err != nil {
+ return nil, err
+ }
+ d = next
+ }
+ if rp.MustBeDir() && !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ return d, nil
+}
+
+// doCreateAt checks that creating a file at rp is permitted, then invokes
+// create to do so.
+//
+// Preconditions: !rp.Done(). For the final path component in rp,
+// !rp.ShouldFollowSymlink().
+func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string, haveUpperWhiteout bool) error) error {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ start := rp.Start().Impl().(*dentry)
+ parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+ if err != nil {
+ return err
+ }
+ if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+ return err
+ }
+ name := rp.Component()
+ if name == "." || name == ".." {
+ return syserror.EEXIST
+ }
+ if !dir && rp.MustBeDir() {
+ return syserror.ENOENT
+ }
+ if parent.vfsd.IsDead() {
+ return syserror.ENOENT
+ }
+ mnt := rp.Mount()
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer mnt.EndWrite()
+ parent.dirMu.Lock()
+ defer parent.dirMu.Unlock()
+
+ // Determine if a file already exists at name.
+ if _, ok := parent.children[name]; ok {
+ return syserror.EEXIST
+ }
+ childLayer, err := fs.lookupLayerLocked(ctx, parent, name)
+ if err != nil {
+ return err
+ }
+ if childLayer.existsInOverlay() {
+ return syserror.EEXIST
+ }
+
+ // Ensure that the parent directory is copied-up so that we can create the
+ // new file in the upper layer.
+ if err := parent.copyUpLocked(ctx); err != nil {
+ return err
+ }
+
+ // Finally create the new file.
+ if err := create(parent, name, childLayer == lookupLayerUpperWhiteout); err != nil {
+ return err
+ }
+ parent.dirents = nil
+ return nil
+}
+
+// Preconditions: pop's parent directory has been copied up.
+func (fs *filesystem) createWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) error {
+ return vfsObj.MknodAt(ctx, fs.creds, pop, &vfs.MknodOptions{
+ Mode: linux.S_IFCHR, // permissions == include/linux/fs.h:WHITEOUT_MODE == 0
+ // DevMajor == DevMinor == 0, from include/linux/fs.h:WHITEOUT_DEV
+ })
+}
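+
+// In userspace terms, the whiteout created above is the file produced by
+// "mknod <name> c 0 0", matching Linux's overlayfs whiteout convention.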
+
+func (fs *filesystem) cleanupRecreateWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) {
+ if err := fs.createWhiteout(ctx, vfsObj, pop); err != nil {
+ ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err)
+ }
+}
+
+// AccessAt implements vfs.Filesystem.Impl.AccessAt.
+func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return err
+ }
+ return d.checkPermissions(creds, ats)
+}
+
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
+func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return nil, err
+ }
+ if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+ return nil, err
+ }
+ layerVD := d.topLayer()
+ return fs.vfsfs.VirtualFilesystem().BoundEndpointAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: layerVD,
+ Start: layerVD,
+ }, &opts)
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return nil, err
+ }
+ if opts.CheckSearchable {
+ if !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, err
+ }
+ }
+ d.IncRef()
+ return &d.vfsd, nil
+}
+
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ start := rp.Start().Impl().(*dentry)
+ d, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+ if err != nil {
+ return nil, err
+ }
+ d.IncRef()
+ return &d.vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+ return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
+ if rp.Mount() != vd.Mount() {
+ return syserror.EXDEV
+ }
+ old := vd.Dentry().Impl().(*dentry)
+ if old.isDir() {
+ return syserror.EPERM
+ }
+ if err := old.copyUpLocked(ctx); err != nil {
+ return err
+ }
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+ newpop := vfs.PathOperation{
+ Root: parent.upperVD,
+ Start: parent.upperVD,
+ Path: fspath.Parse(childName),
+ }
+ if haveUpperWhiteout {
+ if err := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); err != nil {
+ return err
+ }
+ }
+ if err := vfsObj.LinkAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: old.upperVD,
+ Start: old.upperVD,
+ }, &newpop); err != nil {
+ if haveUpperWhiteout {
+ fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop)
+ }
+ return err
+ }
+ creds := rp.Credentials()
+ if err := vfsObj.SetStatAt(ctx, fs.creds, &newpop, &vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: linux.STATX_UID | linux.STATX_GID,
+ UID: uint32(creds.EffectiveKUID),
+ GID: uint32(creds.EffectiveKGID),
+ },
+ }); err != nil {
+ if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); cleanupErr != nil {
+ ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr)
+ } else if haveUpperWhiteout {
+ fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop)
+ }
+ return err
+ }
+ return nil
+ })
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+ return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+ pop := vfs.PathOperation{
+ Root: parent.upperVD,
+ Start: parent.upperVD,
+ Path: fspath.Parse(childName),
+ }
+ if haveUpperWhiteout {
+ if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
+ return err
+ }
+ }
+ if err := vfsObj.MkdirAt(ctx, fs.creds, &pop, &opts); err != nil {
+ if haveUpperWhiteout {
+ fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+ }
+ return err
+ }
+ creds := rp.Credentials()
+ if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: linux.STATX_UID | linux.STATX_GID,
+ UID: uint32(creds.EffectiveKUID),
+ GID: uint32(creds.EffectiveKGID),
+ },
+ }); err != nil {
+ if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
+ ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr)
+ } else if haveUpperWhiteout {
+ fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+ }
+ return err
+ }
+ if haveUpperWhiteout {
+ // There may be directories on lower layers (previously hidden by
+ // the whiteout) that the new directory should not be merged with.
+ // Mark it opaque to prevent merging.
+ if err := vfsObj.SetxattrAt(ctx, fs.creds, &pop, &vfs.SetxattrOptions{
+ Name: _OVL_XATTR_OPAQUE,
+ Value: "y",
+ }); err != nil {
+ if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
+ ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr)
+ } else {
+ fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+ }
+ return err
+ }
+ }
+ return nil
+ })
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+ return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
+ // Disallow attempts to create whiteouts.
+ if opts.Mode&linux.S_IFMT == linux.S_IFCHR && opts.DevMajor == 0 && opts.DevMinor == 0 {
+ return syserror.EPERM
+ }
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+ pop := vfs.PathOperation{
+ Root: parent.upperVD,
+ Start: parent.upperVD,
+ Path: fspath.Parse(childName),
+ }
+ if haveUpperWhiteout {
+ if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
+ return err
+ }
+ }
+ if err := vfsObj.MknodAt(ctx, fs.creds, &pop, &opts); err != nil {
+ if haveUpperWhiteout {
+ fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+ }
+ return err
+ }
+ creds := rp.Credentials()
+ if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: linux.STATX_UID | linux.STATX_GID,
+ UID: uint32(creds.EffectiveKUID),
+ GID: uint32(creds.EffectiveKGID),
+ },
+ }); err != nil {
+ if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
+ ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr)
+ } else if haveUpperWhiteout {
+ fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+ }
+ return err
+ }
+ return nil
+ })
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ mayCreate := opts.Flags&linux.O_CREAT != 0
+ mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL)
+
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+
+ start := rp.Start().Impl().(*dentry)
+ if rp.Done() {
+ if mustCreate {
+ return nil, syserror.EEXIST
+ }
+ return start.openLocked(ctx, rp, &opts)
+ }
+
+afterTrailingSymlink:
+ parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+ if err != nil {
+ return nil, err
+ }
+ // Check for search permission in the parent directory.
+ if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, err
+ }
+ // Determine whether or not we need to create a file.
+ parent.dirMu.Lock()
+ child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
+ if err == syserror.ENOENT && mayCreate {
+ fd, err := fs.createAndOpenLocked(ctx, rp, parent, &opts, &ds)
+ parent.dirMu.Unlock()
+ return fd, err
+ }
+ if err != nil {
+ parent.dirMu.Unlock()
+ return nil, err
+ }
+ // Open existing child or follow symlink.
+ parent.dirMu.Unlock()
+ if mustCreate {
+ return nil, syserror.EEXIST
+ }
+ if child.isSymlink() && rp.ShouldFollowSymlink() {
+ target, err := child.readlink(ctx)
+ if err != nil {
+ return nil, err
+ }
+ if err := rp.HandleSymlink(target); err != nil {
+ return nil, err
+ }
+ start = parent
+ goto afterTrailingSymlink
+ }
+ return child.openLocked(ctx, rp, &opts)
+}
+
+// Preconditions: fs.renameMu must be locked.
+func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+ ats := vfs.AccessTypesForOpenFlags(opts)
+ if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
+ return nil, err
+ }
+ if ats.MayWrite() {
+ if err := d.copyUpLocked(ctx); err != nil {
+ return nil, err
+ }
+ }
+ mnt := rp.Mount()
+
+ // Directory FDs open FDs from each layer when directory entries are read,
+ // so they don't require opening an FD from d.topLayer() up front.
+ ftype := atomic.LoadUint32(&d.mode) & linux.S_IFMT
+ if ftype == linux.S_IFDIR {
+ // Can't open directories with O_CREAT.
+ if opts.Flags&linux.O_CREAT != 0 {
+ return nil, syserror.EISDIR
+ }
+ // Can't open directories writably.
+ if ats&vfs.MayWrite != 0 {
+ return nil, syserror.EISDIR
+ }
+ if opts.Flags&linux.O_DIRECT != 0 {
+ return nil, syserror.EINVAL
+ }
+ fd := &directoryFD{}
+ fd.LockFD.Init(&d.locks)
+ if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
+ UseDentryMetadata: true,
+ }); err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+ }
+
+ layerVD, isUpper := d.topLayerInfo()
+ layerFD, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: layerVD,
+ Start: layerVD,
+ }, opts)
+ if err != nil {
+ return nil, err
+ }
+ layerFlags := layerFD.StatusFlags()
+ fd := &nonDirectoryFD{
+ copiedUp: isUpper,
+ cachedFD: layerFD,
+ cachedFlags: layerFlags,
+ }
+ fd.LockFD.Init(&d.locks)
+ layerFDOpts := layerFD.Options()
+ if err := fd.vfsfd.Init(fd, layerFlags, mnt, &d.vfsd, &layerFDOpts); err != nil {
+ layerFD.DecRef(ctx)
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
+
+// Preconditions: parent.dirMu must be locked. parent does not already contain
+// a child named rp.Component().
+func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *dentry, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) {
+ creds := rp.Credentials()
+ if err := parent.checkPermissions(creds, vfs.MayWrite); err != nil {
+ return nil, err
+ }
+ if parent.vfsd.IsDead() {
+ return nil, syserror.ENOENT
+ }
+ mnt := rp.Mount()
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return nil, err
+ }
+ defer mnt.EndWrite()
+
+ if err := parent.copyUpLocked(ctx); err != nil {
+ return nil, err
+ }
+
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+ childName := rp.Component()
+ pop := vfs.PathOperation{
+ Root: parent.upperVD,
+ Start: parent.upperVD,
+ Path: fspath.Parse(childName),
+ }
+ // We don't know if a whiteout exists on the upper layer; speculatively
+ // unlink it.
+ //
+ // TODO(gvisor.dev/issue/1199): Modify OpenAt => stepLocked so that we do
+ // know whether a whiteout exists.
+ var haveUpperWhiteout bool
+ switch err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err {
+ case nil:
+ haveUpperWhiteout = true
+ case syserror.ENOENT:
+ haveUpperWhiteout = false
+ default:
+ return nil, err
+ }
+ // Create the file on the upper layer, and get an FD representing it.
+ upperFD, err := vfsObj.OpenAt(ctx, fs.creds, &pop, &vfs.OpenOptions{
+ Flags: opts.Flags&^vfs.FileCreationFlags | linux.O_CREAT | linux.O_EXCL,
+ Mode: opts.Mode,
+ })
+ if err != nil {
+ if haveUpperWhiteout {
+ fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+ }
+ return nil, err
+ }
+ // Change the file's owner to the caller. We can't use upperFD.SetStat()
+ // because it will pick up creds from ctx.
+ if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: linux.STATX_UID | linux.STATX_GID,
+ UID: uint32(creds.EffectiveKUID),
+ GID: uint32(creds.EffectiveKGID),
+ },
+ }); err != nil {
+ if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
+ ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr)
+ } else if haveUpperWhiteout {
+ fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+ }
+ return nil, err
+ }
+ // Re-lookup to get a dentry representing the new file, which is needed for
+ // the returned FD.
+ child, err := fs.getChildLocked(ctx, parent, childName, ds)
+ if err != nil {
+ if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
+ ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr)
+ } else if haveUpperWhiteout {
+ fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+ }
+ return nil, err
+ }
+ // Finally construct the overlay FD.
+ upperFlags := upperFD.StatusFlags()
+ fd := &nonDirectoryFD{
+ copiedUp: true,
+ cachedFD: upperFD,
+ cachedFlags: upperFlags,
+ }
+ fd.LockFD.Init(&child.locks)
+ upperFDOpts := upperFD.Options()
+ if err := fd.vfsfd.Init(fd, upperFlags, mnt, &child.vfsd, &upperFDOpts); err != nil {
+ upperFD.DecRef(ctx)
+ // Don't bother with cleanup; the file was created successfully, we
+ // just can't open it anymore for some reason.
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return "", err
+ }
+ layerVD := d.topLayer()
+ return fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: layerVD,
+ Start: layerVD,
+ })
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+ if opts.Flags != 0 {
+ return syserror.EINVAL
+ }
+
+ var ds *[]*dentry
+ fs.renameMu.Lock()
+ defer fs.renameMuUnlockAndCheckDrop(ctx, &ds)
+ newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds)
+ if err != nil {
+ return err
+ }
+ newName := rp.Component()
+ if newName == "." || newName == ".." {
+ return syserror.EBUSY
+ }
+ mnt := rp.Mount()
+ if mnt != oldParentVD.Mount() {
+ return syserror.EXDEV
+ }
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer mnt.EndWrite()
+
+ // FIXME(gvisor.dev/issue/1199): Actually implement rename.
+ _ = newParent
+ return syserror.EXDEV
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ start := rp.Start().Impl().(*dentry)
+ parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+ if err != nil {
+ return err
+ }
+ if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+ return err
+ }
+ if err := rp.Mount().CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer rp.Mount().EndWrite()
+ name := rp.Component()
+ if name == "." {
+ return syserror.EINVAL
+ }
+ if name == ".." {
+ return syserror.ENOTEMPTY
+ }
+ vfsObj := rp.VirtualFilesystem()
+ mntns := vfs.MountNamespaceFromContext(ctx)
+ defer mntns.DecRef(ctx)
+ parent.dirMu.Lock()
+ defer parent.dirMu.Unlock()
+
+ // Ensure that parent is copied-up before potentially holding child.copyMu
+ // below.
+ if err := parent.copyUpLocked(ctx); err != nil {
+ return err
+ }
+
+ // Unlike UnlinkAt, we need a dentry representing the child directory being
+ // removed in order to verify that it's empty.
+ child, err := fs.getChildLocked(ctx, parent, name, &ds)
+ if err != nil {
+ return err
+ }
+ if !child.isDir() {
+ return syserror.ENOTDIR
+ }
+ child.dirMu.Lock()
+ defer child.dirMu.Unlock()
+ whiteouts, err := child.collectWhiteoutsForRmdirLocked(ctx)
+ if err != nil {
+ return err
+ }
+ child.copyMu.RLock()
+ defer child.copyMu.RUnlock()
+ if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
+ return err
+ }
+
+ pop := vfs.PathOperation{
+ Root: parent.upperVD,
+ Start: parent.upperVD,
+ Path: fspath.Parse(name),
+ }
+ if child.upperVD.Ok() {
+ cleanupRecreateWhiteouts := func() {
+ if !child.upperVD.Ok() {
+ return
+ }
+ for whiteoutName, whiteoutUpper := range whiteouts {
+ if !whiteoutUpper {
+ continue
+ }
+ if err := fs.createWhiteout(ctx, vfsObj, &vfs.PathOperation{
+ Root: child.upperVD,
+ Start: child.upperVD,
+ Path: fspath.Parse(whiteoutName),
+ }); err != nil && err != syserror.EEXIST {
+ ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err)
+ }
+ }
+ }
+ // Remove existing whiteouts on the upper layer.
+ for whiteoutName, whiteoutUpper := range whiteouts {
+ if !whiteoutUpper {
+ continue
+ }
+ if err := vfsObj.UnlinkAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: child.upperVD,
+ Start: child.upperVD,
+ Path: fspath.Parse(whiteoutName),
+ }); err != nil {
+ cleanupRecreateWhiteouts()
+ vfsObj.AbortDeleteDentry(&child.vfsd)
+ return err
+ }
+ }
+ // Remove the existing directory on the upper layer.
+ if err := vfsObj.RmdirAt(ctx, fs.creds, &pop); err != nil {
+ cleanupRecreateWhiteouts()
+ vfsObj.AbortDeleteDentry(&child.vfsd)
+ return err
+ }
+ }
+ if err := fs.createWhiteout(ctx, vfsObj, &pop); err != nil {
+ // Don't attempt to recover from this: the original directory is
+ // already gone, so any dentries representing it are invalid, and
+ // creating a new directory won't undo that.
+ ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during RmdirAt: %v", err)
+ vfsObj.AbortDeleteDentry(&child.vfsd)
+ return err
+ }
+
+ vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
+ delete(parent.children, name)
+ ds = appendDentry(ds, child)
+ parent.dirents = nil
+ return nil
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return err
+ }
+
+ mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+ if err := vfs.CheckSetStat(ctx, rp.Credentials(), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
+ return err
+ }
+ mnt := rp.Mount()
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer mnt.EndWrite()
+ if err := d.copyUpLocked(ctx); err != nil {
+ return err
+ }
+ // Changes to d's attributes are serialized by d.copyMu.
+ d.copyMu.Lock()
+ defer d.copyMu.Unlock()
+ if err := d.fs.vfsfs.VirtualFilesystem().SetStatAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.upperVD,
+ Start: d.upperVD,
+ }, &opts); err != nil {
+ return err
+ }
+ d.updateAfterSetStatLocked(&opts)
+ return nil
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return linux.Statx{}, err
+ }
+
+ var stat linux.Statx
+ if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 {
+ layerVD := d.topLayer()
+ stat, err = fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: layerVD,
+ Start: layerVD,
+ }, &vfs.StatOptions{
+ Mask: layerMask,
+ Sync: opts.Sync,
+ })
+ if err != nil {
+ return linux.Statx{}, err
+ }
+ }
+ d.statInternalTo(ctx, &opts, &stat)
+ return stat, nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ _, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return linux.Statfs{}, err
+ }
+ return fs.statFS(ctx)
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+ return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+ pop := vfs.PathOperation{
+ Root: parent.upperVD,
+ Start: parent.upperVD,
+ Path: fspath.Parse(childName),
+ }
+ if haveUpperWhiteout {
+ if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
+ return err
+ }
+ }
+ if err := vfsObj.SymlinkAt(ctx, fs.creds, &pop, target); err != nil {
+ if haveUpperWhiteout {
+ fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+ }
+ return err
+ }
+ creds := rp.Credentials()
+ if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: linux.STATX_UID | linux.STATX_GID,
+ UID: uint32(creds.EffectiveKUID),
+ GID: uint32(creds.EffectiveKGID),
+ },
+ }); err != nil {
+ if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
+ ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr)
+ } else if haveUpperWhiteout {
+ fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+ }
+ return err
+ }
+ return nil
+ })
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ start := rp.Start().Impl().(*dentry)
+ parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+ if err != nil {
+ return err
+ }
+ if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+ return err
+ }
+ if err := rp.Mount().CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer rp.Mount().EndWrite()
+ name := rp.Component()
+ if name == "." || name == ".." {
+ return syserror.EISDIR
+ }
+ if rp.MustBeDir() {
+ return syserror.ENOTDIR
+ }
+ vfsObj := rp.VirtualFilesystem()
+ mntns := vfs.MountNamespaceFromContext(ctx)
+ defer mntns.DecRef(ctx)
+ parent.dirMu.Lock()
+ defer parent.dirMu.Unlock()
+
+ // Ensure that parent is copied-up before potentially holding child.copyMu
+ // below.
+ if err := parent.copyUpLocked(ctx); err != nil {
+ return err
+ }
+
+ child := parent.children[name]
+ var childLayer lookupLayer
+ if child != nil {
+ if child.isDir() {
+ return syserror.EISDIR
+ }
+ if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
+ return err
+ }
+ // Hold child.copyMu to prevent it from being copied-up during
+ // deletion.
+ child.copyMu.RLock()
+ defer child.copyMu.RUnlock()
+ if child.upperVD.Ok() {
+ childLayer = lookupLayerUpper
+ } else {
+ childLayer = lookupLayerLower
+ }
+ } else {
+ // Determine if the file being unlinked actually exists. Holding
+ // parent.dirMu prevents a dentry from being instantiated for the file,
+ // which in turn prevents it from being copied-up, so this result is
+ // stable.
+ childLayer, err = fs.lookupLayerLocked(ctx, parent, name)
+ if err != nil {
+ return err
+ }
+ if !childLayer.existsInOverlay() {
+ return syserror.ENOENT
+ }
+ }
+
+ pop := vfs.PathOperation{
+ Root: parent.upperVD,
+ Start: parent.upperVD,
+ Path: fspath.Parse(name),
+ }
+ if childLayer == lookupLayerUpper {
+ // Remove the existing file on the upper layer.
+ if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
+ if child != nil {
+ vfsObj.AbortDeleteDentry(&child.vfsd)
+ }
+ return err
+ }
+ }
+ if err := fs.createWhiteout(ctx, vfsObj, &pop); err != nil {
+ ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during UnlinkAt: %v", err)
+ if child != nil {
+ vfsObj.AbortDeleteDentry(&child.vfsd)
+ }
+ return err
+ }
+
+ if child != nil {
+ vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
+ delete(parent.children, name)
+ ds = appendDentry(ds, child)
+ }
+ parent.dirents = nil
+ return nil
+}
+
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ _, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return nil, err
+ }
+ // TODO(gvisor.dev/issue/1199): Linux overlayfs actually allows listxattr,
+ // but not any other xattr syscalls. For now we just reject all of them.
+ return nil, syserror.ENOTSUP
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ _, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return "", err
+ }
+ return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ _, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return err
+ }
+ return syserror.ENOTSUP
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ _, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return err
+ }
+ return syserror.ENOTSUP
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+ fs.renameMu.RLock()
+ defer fs.renameMu.RUnlock()
+ return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b)
+}
diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go
new file mode 100644
index 000000000..d3060a481
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/non_directory.go
@@ -0,0 +1,266 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+func (d *dentry) isSymlink() bool {
+ return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK
+}
+
+func (d *dentry) readlink(ctx context.Context) (string, error) {
+ layerVD := d.topLayer()
+ return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: layerVD,
+ Start: layerVD,
+ })
+}
+
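+// nonDirectoryFD implements vfs.FileDescriptionImpl for non-directory files.
+// It wraps an FD opened from the topmost layer containing the file, and
+// transparently switches to an FD from the upper layer once the file is
+// copied up; see currentFDLocked.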
+type nonDirectoryFD struct {
+ fileDescription
+
+ // If copiedUp is false, cachedFD represents
+ // fileDescription.dentry().lowerVDs[0]; otherwise, cachedFD represents
+ // fileDescription.dentry().upperVD. cachedFlags is the last known value of
+ // cachedFD.StatusFlags(). copiedUp, cachedFD, and cachedFlags are
+ // protected by mu.
+ mu sync.Mutex
+ copiedUp bool
+ cachedFD *vfs.FileDescription
+ cachedFlags uint32
+}
+
+func (fd *nonDirectoryFD) getCurrentFD(ctx context.Context) (*vfs.FileDescription, error) {
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ wrappedFD, err := fd.currentFDLocked(ctx)
+ if err != nil {
+ return nil, err
+ }
+ wrappedFD.IncRef()
+ return wrappedFD, nil
+}
+
+func (fd *nonDirectoryFD) currentFDLocked(ctx context.Context) (*vfs.FileDescription, error) {
+ d := fd.dentry()
+ statusFlags := fd.vfsfd.StatusFlags()
+ if !fd.copiedUp && d.isCopiedUp() {
+ // Switch to the copied-up file.
+ upperVD := d.topLayer()
+ upperFD, err := fd.filesystem().vfsfs.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: upperVD,
+ Start: upperVD,
+ }, &vfs.OpenOptions{
+ Flags: statusFlags,
+ })
+ if err != nil {
+ return nil, err
+ }
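+ // Preserve the file offset across the switch to the upper file when
+ // possible; for non-seekable files (e.g. pipes) the Seek fails and
+ // the offset is simply not carried over.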
+ oldOff, oldOffErr := fd.cachedFD.Seek(ctx, 0, linux.SEEK_CUR)
+ if oldOffErr == nil {
+ if _, err := upperFD.Seek(ctx, oldOff, linux.SEEK_SET); err != nil {
+ upperFD.DecRef(ctx)
+ return nil, err
+ }
+ }
+ fd.cachedFD.DecRef(ctx)
+ fd.copiedUp = true
+ fd.cachedFD = upperFD
+ fd.cachedFlags = statusFlags
+ } else if fd.cachedFlags != statusFlags {
+ if err := fd.cachedFD.SetStatusFlags(ctx, d.fs.creds, statusFlags); err != nil {
+ return nil, err
+ }
+ fd.cachedFlags = statusFlags
+ }
+ return fd.cachedFD, nil
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *nonDirectoryFD) Release(ctx context.Context) {
+ fd.cachedFD.DecRef(ctx)
+ fd.cachedFD = nil
+}
+
+// OnClose implements vfs.FileDescriptionImpl.OnClose.
+func (fd *nonDirectoryFD) OnClose(ctx context.Context) error {
+ // Linux doesn't define ovl_file_operations.flush at all (i.e. its
+ // equivalent to OnClose is a no-op). We pass through to
+ // fd.cachedFD.OnClose() without upgrading if fd.dentry() has been
+ // copied-up, since OnClose is mostly used to define post-close writeback,
+ // and if fd.cachedFD hasn't been updated then it can't have been used to
+ // mutate fd.dentry() anyway.
+ fd.mu.Lock()
+ if statusFlags := fd.vfsfd.StatusFlags(); fd.cachedFlags != statusFlags {
+ if err := fd.cachedFD.SetStatusFlags(ctx, fd.filesystem().creds, statusFlags); err != nil {
+ fd.mu.Unlock()
+ return err
+ }
+ fd.cachedFlags = statusFlags
+ }
+ wrappedFD := fd.cachedFD
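+ // Hold a reference across the unlock below so that a concurrent
+ // copy-up in currentFDLocked() can't drop wrappedFD's last reference
+ // while it's still in use.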
+ wrappedFD.IncRef()
+ defer wrappedFD.DecRef(ctx)
+ fd.mu.Unlock()
+ return wrappedFD.OnClose(ctx)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *nonDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+ var stat linux.Statx
+ if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 {
+ wrappedFD, err := fd.getCurrentFD(ctx)
+ if err != nil {
+ return linux.Statx{}, err
+ }
+ stat, err = wrappedFD.Stat(ctx, vfs.StatOptions{
+ Mask: layerMask,
+ Sync: opts.Sync,
+ })
+ wrappedFD.DecRef(ctx)
+ if err != nil {
+ return linux.Statx{}, err
+ }
+ }
+ fd.dentry().statInternalTo(ctx, &opts, &stat)
+ return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *nonDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+ d := fd.dentry()
+ mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+ if err := vfs.CheckSetStat(ctx, auth.CredentialsFromContext(ctx), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
+ return err
+ }
+ mnt := fd.vfsfd.Mount()
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer mnt.EndWrite()
+ if err := d.copyUpLocked(ctx); err != nil {
+ return err
+ }
+ // Changes to d's attributes are serialized by d.copyMu.
+ d.copyMu.Lock()
+ defer d.copyMu.Unlock()
+ wrappedFD, err := fd.currentFDLocked(ctx)
+ if err != nil {
+ return err
+ }
+ if err := wrappedFD.SetStat(ctx, opts); err != nil {
+ return err
+ }
+ d.updateAfterSetStatLocked(&opts)
+ return nil
+}
+
+// StatFS implements vfs.FileDescriptionImpl.StatFS.
+func (fd *nonDirectoryFD) StatFS(ctx context.Context) (linux.Statfs, error) {
+ return fd.filesystem().statFS(ctx)
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *nonDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ wrappedFD, err := fd.getCurrentFD(ctx)
+ if err != nil {
+ return 0, err
+ }
+ defer wrappedFD.DecRef(ctx)
+ return wrappedFD.PRead(ctx, dst, offset, opts)
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *nonDirectoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ // Hold fd.mu during the read to serialize the file offset.
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ wrappedFD, err := fd.currentFDLocked(ctx)
+ if err != nil {
+ return 0, err
+ }
+ return wrappedFD.Read(ctx, dst, opts)
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *nonDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ wrappedFD, err := fd.getCurrentFD(ctx)
+ if err != nil {
+ return 0, err
+ }
+ defer wrappedFD.DecRef(ctx)
+ return wrappedFD.PWrite(ctx, src, offset, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *nonDirectoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ // Hold fd.mu during the write to serialize the file offset.
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ wrappedFD, err := fd.currentFDLocked(ctx)
+ if err != nil {
+ return 0, err
+ }
+ return wrappedFD.Write(ctx, src, opts)
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *nonDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ // Hold fd.mu during the seek to serialize the file offset.
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ wrappedFD, err := fd.currentFDLocked(ctx)
+ if err != nil {
+ return 0, err
+ }
+ return wrappedFD.Seek(ctx, offset, whence)
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *nonDirectoryFD) Sync(ctx context.Context) error {
+ fd.mu.Lock()
+ if !fd.dentry().isCopiedUp() {
+ fd.mu.Unlock()
+ return nil
+ }
+ wrappedFD, err := fd.currentFDLocked(ctx)
+ if err != nil {
+ fd.mu.Unlock()
+ return err
+ }
+ wrappedFD.IncRef()
+ defer wrappedFD.DecRef(ctx)
+ fd.mu.Unlock()
+ return wrappedFD.Sync(ctx)
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *nonDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+ wrappedFD, err := fd.getCurrentFD(ctx)
+ if err != nil {
+ return err
+ }
+ defer wrappedFD.DecRef(ctx)
+ return wrappedFD.ConfigureMMap(ctx, opts)
+}
diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go
new file mode 100644
index 000000000..75cc006bf
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/overlay.go
@@ -0,0 +1,627 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package overlay provides an overlay filesystem implementation, which
+// synthesizes a filesystem by composing one or more immutable filesystems
+// ("lower layers") with an optional mutable filesystem ("upper layer").
+//
+// Lock order:
+//
+// directoryFD.mu / nonDirectoryFD.mu
+// filesystem.renameMu
+// dentry.dirMu
+// dentry.copyMu
+//
+// Locking dentry.dirMu in multiple dentries requires that parent dentries are
+// locked before child dentries, and that filesystem.renameMu is locked to
+// stabilize this relationship.
+package overlay
+
+import (
+ "strings"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Name is the default filesystem name.
+const Name = "overlay"
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+ return Name
+}
+
+// FilesystemOptions may be passed as vfs.GetFilesystemOptions.InternalData to
+// FilesystemType.GetFilesystem.
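+//
+// A hypothetical configuration sketch (upperVD and lowerVD are assumed to be
+// vfs.VirtualDentry references already held by the caller):
+//
+//	getFSOpts := vfs.GetFilesystemOptions{
+//		InternalData: overlay.FilesystemOptions{
+//			UpperRoot:  upperVD,
+//			LowerRoots: []vfs.VirtualDentry{lowerVD},
+//		},
+//	}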
+type FilesystemOptions struct {
+ // Callers passing FilesystemOptions to
+ // overlay.FilesystemType.GetFilesystem() are responsible for ensuring that
+ // the vfs.Mounts comprising the layers of the overlay filesystem do not
+ // contain submounts.
+
+ // If UpperRoot.Ok(), it is the root of the writable upper layer of the
+ // overlay.
+ UpperRoot vfs.VirtualDentry
+
+ // LowerRoots contains the roots of the immutable lower layers of the
+ // overlay. LowerRoots is immutable.
+ LowerRoots []vfs.VirtualDentry
+}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+ vfsfs vfs.Filesystem
+
+ // Immutable options.
+ opts FilesystemOptions
+
+ // creds is a copy of the filesystem's creator's credentials, which are
+ // used for accesses to the filesystem's layers. creds is immutable.
+ creds *auth.Credentials
+
+ // dirDevMinor is the device minor number used for directories. dirDevMinor
+ // is immutable.
+ dirDevMinor uint32
+
+ // lowerDevMinors maps lower layer filesystems to device minor numbers
+ // assigned to non-directory files originating from that filesystem.
+ // lowerDevMinors is immutable.
+ lowerDevMinors map[*vfs.Filesystem]uint32
+
+ // renameMu synchronizes renaming with non-renaming operations in order to
+ // ensure consistent lock ordering between dentry.dirMu in different
+ // dentries.
+ renameMu sync.RWMutex
+
+ // lastDirIno is the last inode number assigned to a directory. lastDirIno
+ // is accessed using atomic memory operations.
+ lastDirIno uint64
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+ mopts := vfs.GenericParseMountOptions(opts.Data)
+ fsoptsRaw := opts.InternalData
+ fsopts, haveFSOpts := fsoptsRaw.(FilesystemOptions)
+ if fsoptsRaw != nil && !haveFSOpts {
+ ctx.Warningf("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw)
+ return nil, nil, syserror.EINVAL
+ }
+ if haveFSOpts {
+ if len(fsopts.LowerRoots) == 0 {
+ ctx.Warningf("overlay.FilesystemType.GetFilesystem: LowerRoots must be non-empty")
+ return nil, nil, syserror.EINVAL
+ }
+ if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() {
+ ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two LowerRoots are required when UpperRoot is unspecified")
+ return nil, nil, syserror.EINVAL
+ }
+ // We don't enforce a maximum number of lower layers when not
+ // configured by applications; the sandbox owner can have an overlay
+ // filesystem with any number of lower layers.
+ } else {
+ vfsroot := vfs.RootFromContext(ctx)
+ defer vfsroot.DecRef(ctx)
+ upperPathname, ok := mopts["upperdir"]
+ if ok {
+ delete(mopts, "upperdir")
+ // Linux overlayfs also requires a workdir when upperdir is
+ // specified; we don't, so silently ignore this option.
+ delete(mopts, "workdir")
+ upperPath := fspath.Parse(upperPathname)
+ if !upperPath.Absolute {
+ ctx.Warningf("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname)
+ return nil, nil, syserror.EINVAL
+ }
+ upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
+ Root: vfsroot,
+ Start: vfsroot,
+ Path: upperPath,
+ FollowFinalSymlink: true,
+ }, &vfs.GetDentryOptions{
+ CheckSearchable: true,
+ })
+ if err != nil {
+ ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err)
+ return nil, nil, err
+ }
+ defer upperRoot.DecRef(ctx)
+ privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */)
+ if err != nil {
+ ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err)
+ return nil, nil, err
+ }
+ defer privateUpperRoot.DecRef(ctx)
+ fsopts.UpperRoot = privateUpperRoot
+ }
+ lowerPathnamesStr, ok := mopts["lowerdir"]
+ if !ok {
+ ctx.Warningf("overlay.FilesystemType.GetFilesystem: missing required option lowerdir")
+ return nil, nil, syserror.EINVAL
+ }
+ delete(mopts, "lowerdir")
+ lowerPathnames := strings.Split(lowerPathnamesStr, ":")
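+ // As in Linux, the first lowerdir is the topmost lower layer:
+ // "lowerdir=/l1:/l2" stacks /l1 over /l2.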
+ const maxLowerLayers = 500 // Linux: fs/overlayfs/super.c:OVL_MAX_STACK
+ if len(lowerPathnames) < 2 && !fsopts.UpperRoot.Ok() {
+ ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two lowerdirs are required when upperdir is unspecified")
+ return nil, nil, syserror.EINVAL
+ }
+ if len(lowerPathnames) > maxLowerLayers {
+ ctx.Warningf("overlay.FilesystemType.GetFilesystem: %d lowerdirs specified, maximum %d", len(lowerPathnames), maxLowerLayers)
+ return nil, nil, syserror.EINVAL
+ }
+ for _, lowerPathname := range lowerPathnames {
+ lowerPath := fspath.Parse(lowerPathname)
+ if !lowerPath.Absolute {
+ ctx.Warningf("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname)
+ return nil, nil, syserror.EINVAL
+ }
+ lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
+ Root: vfsroot,
+ Start: vfsroot,
+ Path: lowerPath,
+ FollowFinalSymlink: true,
+ }, &vfs.GetDentryOptions{
+ CheckSearchable: true,
+ })
+ if err != nil {
+ ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err)
+ return nil, nil, err
+ }
+ defer lowerRoot.DecRef(ctx)
+ privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */)
+ if err != nil {
+ ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err)
+ return nil, nil, err
+ }
+ defer privateLowerRoot.DecRef(ctx)
+ fsopts.LowerRoots = append(fsopts.LowerRoots, privateLowerRoot)
+ }
+ }
+ if len(mopts) != 0 {
+ ctx.Warningf("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts)
+ return nil, nil, syserror.EINVAL
+ }
+
+ // Allocate device numbers.
+ dirDevMinor, err := vfsObj.GetAnonBlockDevMinor()
+ if err != nil {
+ return nil, nil, err
+ }
+ lowerDevMinors := make(map[*vfs.Filesystem]uint32)
+ for _, lowerRoot := range fsopts.LowerRoots {
+ lowerFS := lowerRoot.Mount().Filesystem()
+ if _, ok := lowerDevMinors[lowerFS]; !ok {
+ devMinor, err := vfsObj.GetAnonBlockDevMinor()
+ if err != nil {
+ vfsObj.PutAnonBlockDevMinor(dirDevMinor)
+ for _, lowerDevMinor := range lowerDevMinors {
+ vfsObj.PutAnonBlockDevMinor(lowerDevMinor)
+ }
+ return nil, nil, err
+ }
+ lowerDevMinors[lowerFS] = devMinor
+ }
+ }
+
+ // Take extra references held by the filesystem.
+ if fsopts.UpperRoot.Ok() {
+ fsopts.UpperRoot.IncRef()
+ }
+ for _, lowerRoot := range fsopts.LowerRoots {
+ lowerRoot.IncRef()
+ }
+
+ fs := &filesystem{
+ opts: fsopts,
+ creds: creds.Fork(),
+ dirDevMinor: dirDevMinor,
+ lowerDevMinors: lowerDevMinors,
+ }
+ fs.vfsfs.Init(vfsObj, &fstype, fs)
+
+ // Construct the root dentry.
+ root := fs.newDentry()
+ root.refs = 1
+ if fs.opts.UpperRoot.Ok() {
+ fs.opts.UpperRoot.IncRef()
+ root.copiedUp = 1
+ root.upperVD = fs.opts.UpperRoot
+ }
+ for _, lowerRoot := range fs.opts.LowerRoots {
+ lowerRoot.IncRef()
+ root.lowerVDs = append(root.lowerVDs, lowerRoot)
+ }
+ rootTopVD := root.topLayer()
+ // Get metadata from the topmost layer. See fs.lookupLocked().
+ const rootStatMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
+ rootStat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+ Root: rootTopVD,
+ Start: rootTopVD,
+ }, &vfs.StatOptions{
+ Mask: rootStatMask,
+ })
+ if err != nil {
+ root.destroyLocked(ctx)
+ fs.vfsfs.DecRef(ctx)
+ return nil, nil, err
+ }
+ if rootStat.Mask&rootStatMask != rootStatMask {
+ root.destroyLocked(ctx)
+ fs.vfsfs.DecRef(ctx)
+ return nil, nil, syserror.EREMOTE
+ }
+ if isWhiteout(&rootStat) {
+ ctx.Warningf("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout")
+ root.destroyLocked(ctx)
+ fs.vfsfs.DecRef(ctx)
+ return nil, nil, syserror.EINVAL
+ }
+ root.mode = uint32(rootStat.Mode)
+ root.uid = rootStat.UID
+ root.gid = rootStat.GID
+ if rootStat.Mode&linux.S_IFMT == linux.S_IFDIR {
+ root.devMajor = linux.UNNAMED_MAJOR
+ root.devMinor = fs.dirDevMinor
+ root.ino = fs.newDirIno()
+ } else if !root.upperVD.Ok() {
+ root.devMajor = linux.UNNAMED_MAJOR
+ root.devMinor = fs.lowerDevMinors[root.lowerVDs[0].Mount().Filesystem()]
+ root.ino = rootStat.Ino
+ } else {
+ root.devMajor = rootStat.DevMajor
+ root.devMinor = rootStat.DevMinor
+ root.ino = rootStat.Ino
+ }
+
+ return &fs.vfsfs, &root.vfsd, nil
+}
+
+// clonePrivateMount creates a non-recursive bind mount rooted at vd, not
+// associated with any MountNamespace, and returns the root of the new mount.
+// (This is required to ensure that each layer of an overlay comprises only a
+// single mount, and therefore can't cross into e.g. the overlay filesystem
+// itself, risking lock recursion.) A reference is held on the returned
+// VirtualDentry.
+func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forceReadOnly bool) (vfs.VirtualDentry, error) {
+ oldmnt := vd.Mount()
+ opts := oldmnt.Options()
+ if forceReadOnly {
+ opts.ReadOnly = true
+ }
+ newmnt, err := vfsObj.NewDisconnectedMount(oldmnt.Filesystem(), vd.Dentry(), &opts)
+ if err != nil {
+ return vfs.VirtualDentry{}, err
+ }
+ return vfs.MakeVirtualDentry(newmnt, vd.Dentry()), nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release(ctx context.Context) {
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+ vfsObj.PutAnonBlockDevMinor(fs.dirDevMinor)
+ for _, lowerDevMinor := range fs.lowerDevMinors {
+ vfsObj.PutAnonBlockDevMinor(lowerDevMinor)
+ }
+ if fs.opts.UpperRoot.Ok() {
+ fs.opts.UpperRoot.DecRef(ctx)
+ }
+ for _, lowerRoot := range fs.opts.LowerRoots {
+ lowerRoot.DecRef(ctx)
+ }
+}
+
+func (fs *filesystem) statFS(ctx context.Context) (linux.Statfs, error) {
+ // Always statfs the root of the topmost layer. Compare Linux's
+ // fs/overlayfs/super.c:ovl_statfs().
+ var rootVD vfs.VirtualDentry
+ if fs.opts.UpperRoot.Ok() {
+ rootVD = fs.opts.UpperRoot
+ } else {
+ rootVD = fs.opts.LowerRoots[0]
+ }
+ fsstat, err := fs.vfsfs.VirtualFilesystem().StatFSAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: rootVD,
+ Start: rootVD,
+ })
+ if err != nil {
+ return linux.Statfs{}, err
+ }
+ fsstat.Type = linux.OVERLAYFS_SUPER_MAGIC
+ return fsstat, nil
+}
+
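+// newDirIno returns the next synthetic inode number for a directory. Merged
+// directories are not backed by any single layer's inode, so they are
+// assigned overlay-private inode numbers.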
+func (fs *filesystem) newDirIno() uint64 {
+ return atomic.AddUint64(&fs.lastDirIno, 1)
+}
+
+// dentry implements vfs.DentryImpl.
+type dentry struct {
+ vfsd vfs.Dentry
+
+ refs int64
+
+ // fs is the owning filesystem. fs is immutable.
+ fs *filesystem
+
+ // mode, uid, and gid are the file mode, owner, and group of the file in
+ // the topmost layer (and therefore the overlay file as well), and are used
+ // for permission checks on this dentry. These fields are protected by
+ // copyMu and accessed using atomic memory operations.
+ mode uint32
+ uid uint32
+ gid uint32
+
+ // copiedUp is 1 if this dentry has been copied-up (i.e. upperVD.Ok()) and
+ // 0 otherwise. copiedUp is accessed using atomic memory operations.
+ copiedUp uint32
+
+ // parent is the dentry corresponding to this dentry's parent directory.
+ // name is this dentry's name in parent. If this dentry is a filesystem
+ // root, parent is nil and name is the empty string. parent and name are
+ // protected by fs.renameMu.
+ parent *dentry
+ name string
+
+ // If this dentry represents a directory, children maps the names of
+ // children for which dentries have been instantiated to those dentries,
+ // and dirents (if not nil) is a cache of dirents as returned by
+ // directoryFDs representing this directory. children is protected by
+ // dirMu.
+ dirMu sync.Mutex
+ children map[string]*dentry
+ dirents []vfs.Dirent
+
+ // upperVD and lowerVDs are the files from the overlay filesystem's layers
+ // that comprise the file on the overlay filesystem.
+ //
+ // If !upperVD.Ok(), it can transition to a valid vfs.VirtualDentry (i.e.
+ // be copied up) with copyMu locked for writing; otherwise, it is
+ // immutable. lowerVDs is always immutable.
+ copyMu sync.RWMutex
+ upperVD vfs.VirtualDentry
+ lowerVDs []vfs.VirtualDentry
+
+ // inlineLowerVDs backs lowerVDs in the common case where len(lowerVDs) <=
+ // len(inlineLowerVDs).
+ inlineLowerVDs [1]vfs.VirtualDentry
+
+ // devMajor, devMinor, and ino are the device major/minor and inode numbers
+ // used by this dentry. These fields are protected by copyMu and accessed
+ // using atomic memory operations.
+ devMajor uint32
+ devMinor uint32
+ ino uint64
+
+ locks vfs.FileLocks
+}
+
+// newDentry creates a new dentry. The dentry initially has no references; it
+// is the caller's responsibility to set the dentry's reference count and/or
+// call dentry.destroy() as appropriate. The dentry is initially invalid in
+// that it contains no layers; the caller is responsible for setting them.
+func (fs *filesystem) newDentry() *dentry {
+ d := &dentry{
+ fs: fs,
+ }
+ d.lowerVDs = d.inlineLowerVDs[:0]
+ d.vfsd.Init(d)
+ return d
+}
+
+// IncRef implements vfs.DentryImpl.IncRef.
+func (d *dentry) IncRef() {
+ // d.refs may be 0 if d.fs.renameMu is locked, which serializes against
+ // d.checkDropLocked().
+ atomic.AddInt64(&d.refs, 1)
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef.
+func (d *dentry) TryIncRef() bool {
+ for {
+ refs := atomic.LoadInt64(&d.refs)
+ if refs <= 0 {
+ return false
+ }
+ if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) {
+ return true
+ }
+ }
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+func (d *dentry) DecRef(ctx context.Context) {
+ if refs := atomic.AddInt64(&d.refs, -1); refs == 0 {
+ d.fs.renameMu.Lock()
+ d.checkDropLocked(ctx)
+ d.fs.renameMu.Unlock()
+ } else if refs < 0 {
+ panic("overlay.dentry.DecRef() called without holding a reference")
+ }
+}
+
+// checkDropLocked should be called after d's reference count becomes 0 or
+// after d has been deleted.
+//
+// Preconditions: d.fs.renameMu must be locked for writing.
+func (d *dentry) checkDropLocked(ctx context.Context) {
+ // Dentries with a positive reference count must be retained. (The only way
+ // to obtain a reference on a dentry with zero references is via path
+ // resolution, which requires renameMu, so if d.refs is zero then it will
+ // remain zero while we hold renameMu for writing.) Dentries with a
+ // negative reference count have already been destroyed.
+ if atomic.LoadInt64(&d.refs) != 0 {
+ return
+ }
+ // Refs is still zero; destroy it.
+ d.destroyLocked(ctx)
+}
+
+// destroyLocked destroys the dentry.
+//
+// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0.
+func (d *dentry) destroyLocked(ctx context.Context) {
+ switch atomic.LoadInt64(&d.refs) {
+ case 0:
+ // Mark the dentry destroyed.
+ atomic.StoreInt64(&d.refs, -1)
+ case -1:
+ panic("overlay.dentry.destroyLocked() called on already destroyed dentry")
+ default:
+ panic("overlay.dentry.destroyLocked() called with references on the dentry")
+ }
+
+ if d.upperVD.Ok() {
+ d.upperVD.DecRef(ctx)
+ }
+ for _, lowerVD := range d.lowerVDs {
+ lowerVD.DecRef(ctx)
+ }
+
+ if d.parent != nil {
+ d.parent.dirMu.Lock()
+ if !d.vfsd.IsDead() {
+ delete(d.parent.children, d.name)
+ }
+ d.parent.dirMu.Unlock()
+ // Drop the reference held by d on its parent without recursively
+ // locking d.fs.renameMu.
+ if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 {
+ d.parent.checkDropLocked(ctx)
+ } else if refs < 0 {
+ panic("overlay.dentry.DecRef() called without holding a reference")
+ }
+ }
+}
+
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+func (d *dentry) InotifyWithParent(ctx context.Context, events uint32, cookie uint32, et vfs.EventType) {
+ // TODO(gvisor.dev/issue/1479): Implement inotify.
+}
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *dentry) Watches() *vfs.Watches {
+ // TODO(gvisor.dev/issue/1479): Implement inotify.
+ return nil
+}
+
+// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *dentry) OnZeroWatches(context.Context) {}
+
+// iterLayers invokes yield on each layer comprising d, from top to bottom. If
+// any call to yield returns false, iterLayers stops iteration.
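+//
+// A hypothetical usage sketch, counting the layers backing d:
+//
+//	var layers int
+//	d.iterLayers(func(vd vfs.VirtualDentry, isUpper bool) bool {
+//		layers++
+//		return true // continue to the next layer
+//	})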
+func (d *dentry) iterLayers(yield func(vd vfs.VirtualDentry, isUpper bool) bool) {
+ if d.isCopiedUp() {
+ if !yield(d.upperVD, true) {
+ return
+ }
+ }
+ for _, lowerVD := range d.lowerVDs {
+ if !yield(lowerVD, false) {
+ return
+ }
+ }
+}
+
+func (d *dentry) topLayerInfo() (vd vfs.VirtualDentry, isUpper bool) {
+ if d.isCopiedUp() {
+ return d.upperVD, true
+ }
+ return d.lowerVDs[0], false
+}
+
+func (d *dentry) topLayer() vfs.VirtualDentry {
+ vd, _ := d.topLayerInfo()
+ return vd
+}
+
+func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+ return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
+}
+
+// statInternalMask is the set of stat fields that is set by
+// dentry.statInternalTo().
+const statInternalMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
+
+// statInternalTo writes fields to stat that are stored in d, and therefore do
+// not require invoking StatAt on the overlay's layers.
+func (d *dentry) statInternalTo(ctx context.Context, opts *vfs.StatOptions, stat *linux.Statx) {
+ stat.Mask |= statInternalMask
+ if d.isDir() {
+ // Linux sets nlink to 1 for merged directories
+ // (fs/overlayfs/inode.c:ovl_getattr()); we set it to 2 because this is
+ // correct more often ("." and the directory's entry in its parent),
+ // and some of our tests expect this.
+ stat.Nlink = 2
+ }
+ stat.UID = atomic.LoadUint32(&d.uid)
+ stat.GID = atomic.LoadUint32(&d.gid)
+ stat.Mode = uint16(atomic.LoadUint32(&d.mode))
+ stat.Ino = atomic.LoadUint64(&d.ino)
+ stat.DevMajor = atomic.LoadUint32(&d.devMajor)
+ stat.DevMinor = atomic.LoadUint32(&d.devMinor)
+}
+
+// Preconditions: d.copyMu must be locked for writing.
+func (d *dentry) updateAfterSetStatLocked(opts *vfs.SetStatOptions) {
+ if opts.Stat.Mask&linux.STATX_MODE != 0 {
+ atomic.StoreUint32(&d.mode, (d.mode&linux.S_IFMT)|uint32(opts.Stat.Mode&^linux.S_IFMT))
+ }
+ if opts.Stat.Mask&linux.STATX_UID != 0 {
+ atomic.StoreUint32(&d.uid, opts.Stat.UID)
+ }
+ if opts.Stat.Mask&linux.STATX_GID != 0 {
+ atomic.StoreUint32(&d.gid, opts.Stat.GID)
+ }
+}
+
+// fileDescription is embedded by overlay implementations of
+// vfs.FileDescriptionImpl.
+type fileDescription struct {
+ vfsfd vfs.FileDescription
+ vfs.FileDescriptionDefaultImpl
+ vfs.LockFD
+}
+
+func (fd *fileDescription) filesystem() *filesystem {
+ return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
+}
+
+func (fd *fileDescription) dentry() *dentry {
+ return fd.vfsfd.Dentry().Impl().(*dentry)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go
index cab771211..2ca793db9 100644
--- a/pkg/sentry/fsimpl/pipefs/pipefs.go
+++ b/pkg/sentry/fsimpl/pipefs/pipefs.go
@@ -63,9 +63,9 @@ func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) {
}
// Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {
+func (fs *filesystem) Release(ctx context.Context) {
fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
- fs.Filesystem.Release()
+ fs.Filesystem.Release(ctx)
}
// PrependPath implements vfs.FilesystemImpl.PrependPath.
@@ -81,7 +81,8 @@ type inode struct {
kernfs.InodeNotSymlink
kernfs.InodeNoopRefCount
- pipe *pipe.VFSPipe
+ locks vfs.FileLocks
+ pipe *pipe.VFSPipe
ino uint64
uid auth.KUID
@@ -114,7 +115,7 @@ func (i *inode) Mode() linux.FileMode {
}
// Stat implements kernfs.Inode.Stat.
-func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+func (i *inode) Stat(_ context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
ts := linux.NsecToStatxTimestamp(i.ctime.Nanoseconds())
return linux.Statx{
Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS,
@@ -147,7 +148,7 @@ func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.
// Open implements kernfs.Inode.Open.
func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags)
+ return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags, &i.locks)
}
// NewConnectedPipeFDs returns a pair of FileDescriptions representing the read
@@ -159,6 +160,6 @@ func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vf
inode := newInode(ctx, fs)
var d kernfs.Dentry
d.Init(inode)
- defer d.DecRef()
+ defer d.DecRef(ctx)
return inode.pipe.ReaderWriterPair(mnt, d.VFSDentry(), flags)
}
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index 17c1342b5..6014138ff 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -22,6 +22,7 @@ go_library(
"//pkg/log",
"//pkg/refs",
"//pkg/safemem",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/fsbridge",
"//pkg/sentry/fsimpl/kernfs",
"//pkg/sentry/inet",
diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go
index 609210253..2463d51cd 100644
--- a/pkg/sentry/fsimpl/proc/filesystem.go
+++ b/pkg/sentry/fsimpl/proc/filesystem.go
@@ -77,9 +77,9 @@ func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualF
}
// Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {
+func (fs *filesystem) Release(ctx context.Context) {
fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
- fs.Filesystem.Release()
+ fs.Filesystem.Release(ctx)
}
// dynamicInode is an overfitted interface for common Inodes with
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index 36a911db4..79c2725f3 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -37,6 +37,8 @@ type subtasksInode struct {
kernfs.OrderedChildren
kernfs.AlwaysValid
+ locks vfs.FileLocks
+
fs *filesystem
task *kernel.Task
pidns *kernel.PIDNamespace
@@ -126,7 +128,7 @@ func (fd *subtasksFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallbac
return fd.GenericDirectoryFD.IterDirents(ctx, cb)
}
-// Seek implements vfs.FileDecriptionImpl.Seek.
+// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *subtasksFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
if fd.task.ExitState() >= kernel.TaskExitZombie {
return 0, syserror.ENOENT
@@ -153,7 +155,7 @@ func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) erro
// Open implements kernfs.Inode.
func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd := &subtasksFD{task: i.task}
- if err := fd.Init(&i.OrderedChildren, &opts); err != nil {
+ if err := fd.Init(&i.OrderedChildren, &i.locks, &opts); err != nil {
return nil, err
}
if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
@@ -163,8 +165,8 @@ func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *v
}
// Stat implements kernfs.Inode.
-func (i *subtasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
- stat, err := i.InodeAttrs.Stat(vsfs, opts)
+func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+ stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts)
if err != nil {
return linux.Statx{}, err
}
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 482055db1..a5c7aa470 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -38,6 +38,8 @@ type taskInode struct {
kernfs.InodeAttrs
kernfs.OrderedChildren
+ locks vfs.FileLocks
+
task *kernel.Task
}
@@ -103,7 +105,7 @@ func (i *taskInode) Valid(ctx context.Context) bool {
// Open implements kernfs.Inode.
func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
if err != nil {
return nil, err
}
@@ -154,8 +156,8 @@ func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.
}
// Stat implements kernfs.Inode.
-func (i *taskOwnedInode) Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
- stat, err := i.Inode.Stat(fs, opts)
+func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+ stat, err := i.Inode.Stat(ctx, fs, opts)
if err != nil {
return linux.Statx{}, err
}
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index 44ccc9e4a..f0d3f7f5e 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -43,16 +43,18 @@ func getTaskFD(t *kernel.Task, fd int32) (*vfs.FileDescription, kernel.FDFlags)
return file, flags
}
-func taskFDExists(t *kernel.Task, fd int32) bool {
+func taskFDExists(ctx context.Context, t *kernel.Task, fd int32) bool {
file, _ := getTaskFD(t, fd)
if file == nil {
return false
}
- file.DecRef()
+ file.DecRef(ctx)
return true
}
type fdDir struct {
+ locks vfs.FileLocks
+
fs *filesystem
task *kernel.Task
@@ -62,15 +64,14 @@ type fdDir struct {
}
// IterDirents implements kernfs.inodeDynamicLookup.
-func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, absOffset, relOffset int64) (int64, error) {
+func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
var fds []int32
i.task.WithMuLocked(func(t *kernel.Task) {
if fdTable := t.FDTable(); fdTable != nil {
- fds = fdTable.GetFDs()
+ fds = fdTable.GetFDs(ctx)
}
})
- offset := absOffset + relOffset
typ := uint8(linux.DT_REG)
if i.produceSymlink {
typ = linux.DT_LNK
@@ -134,7 +135,7 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro
return nil, syserror.ENOENT
}
fd := int32(fdInt)
- if !taskFDExists(i.task, fd) {
+ if !taskFDExists(ctx, i.task, fd) {
return nil, syserror.ENOENT
}
taskDentry := i.fs.newFDSymlink(i.task, fd, i.fs.NextIno())
@@ -143,7 +144,7 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro
// Open implements kernfs.Inode.
func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
if err != nil {
return nil, err
}
@@ -203,9 +204,9 @@ func (s *fdSymlink) Readlink(ctx context.Context) (string, error) {
if file == nil {
return "", syserror.ENOENT
}
- defer file.DecRef()
+ defer file.DecRef(ctx)
root := vfs.RootFromContext(ctx)
- defer root.DecRef()
+ defer root.DecRef(ctx)
return s.task.Kernel().VFS().PathnameWithDeleted(ctx, root, file.VirtualDentry())
}
@@ -214,7 +215,7 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen
if file == nil {
return vfs.VirtualDentry{}, "", syserror.ENOENT
}
- defer file.DecRef()
+ defer file.DecRef(ctx)
vd := file.VirtualDentry()
vd.IncRef()
return vd, "", nil
@@ -257,7 +258,7 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry,
return nil, syserror.ENOENT
}
fd := int32(fdInt)
- if !taskFDExists(i.task, fd) {
+ if !taskFDExists(ctx, i.task, fd) {
return nil, syserror.ENOENT
}
data := &fdInfoData{
@@ -270,7 +271,7 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry,
// Open implements kernfs.Inode.
func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
if err != nil {
return nil, err
}
@@ -296,7 +297,7 @@ func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
if file == nil {
return syserror.ENOENT
}
- defer file.DecRef()
+ defer file.DecRef(ctx)
// TODO(b/121266871): Include pos, locks, and other data. For now we only
// have flags.
// See https://www.kernel.org/doc/Documentation/filesystems/proc.txt
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 2f297e48a..830b78949 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -22,6 +22,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/safemem"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsbridge"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -34,6 +35,10 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
+// "There is an (arbitrary) limit on the number of lines in the file. As at
+// Linux 3.18, the limit is five lines." - user_namespaces(7)
+const maxIDMapLines = 5
+
// mm gets the kernel task's MemoryManager. No additional reference is taken on
// mm here. This is safe because MemoryManager.destroy is required to leave the
// MemoryManager in a state where it's still usable as a DynamicBytesSource.
@@ -226,8 +231,9 @@ func (d *cmdlineData) Generate(ctx context.Context, buf *bytes.Buffer) error {
// Linux will return envp up to and including the first NULL character,
// so find it.
- if end := bytes.IndexByte(buf.Bytes()[ar.Length():], 0); end != -1 {
- buf.Truncate(end)
+ envStart := int(ar.Length())
+ if nullIdx := bytes.IndexByte(buf.Bytes()[envStart:], 0); nullIdx != -1 {
+ buf.Truncate(envStart + nullIdx)
}
}
@@ -282,7 +288,8 @@ func (d *commData) Generate(ctx context.Context, buf *bytes.Buffer) error {
return nil
}
-// idMapData implements vfs.DynamicBytesSource for /proc/[pid]/{gid_map|uid_map}.
+// idMapData implements vfs.WritableDynamicBytesSource for
+// /proc/[pid]/{gid_map|uid_map}.
//
// +stateify savable
type idMapData struct {
@@ -294,7 +301,7 @@ type idMapData struct {
var _ dynamicInode = (*idMapData)(nil)
-// Generate implements vfs.DynamicBytesSource.Generate.
+// Generate implements vfs.WritableDynamicBytesSource.Generate.
func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error {
var entries []auth.IDMapEntry
if d.gids {
@@ -308,6 +315,60 @@ func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error {
return nil
}
+// Write implements vfs.WritableDynamicBytesSource.Write.
+func (d *idMapData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+ // "In addition, the number of bytes written to the file must be less than
+ // the system page size, and the write must be performed at the start of
+ // the file ..." - user_namespaces(7)
+ srclen := src.NumBytes()
+ if srclen >= usermem.PageSize || offset != 0 {
+ return 0, syserror.EINVAL
+ }
+ b := make([]byte, srclen)
+ if _, err := src.CopyIn(ctx, b); err != nil {
+ return 0, err
+ }
+
+ // Truncate from the first NULL byte.
+ nul := int64(bytes.IndexByte(b, 0))
+ if nul == -1 {
+ nul = srclen
+ }
+ b = b[:nul]
+ // Remove the last \n.
+ if nul >= 1 && b[nul-1] == '\n' {
+ b = b[:nul-1]
+ }
+ lines := bytes.SplitN(b, []byte("\n"), maxIDMapLines+1)
+ if len(lines) > maxIDMapLines {
+ return 0, syserror.EINVAL
+ }
+
+ entries := make([]auth.IDMapEntry, len(lines))
+ for i, l := range lines {
+ var e auth.IDMapEntry
+ _, err := fmt.Sscan(string(l), &e.FirstID, &e.FirstParentID, &e.Length)
+ if err != nil {
+ return 0, syserror.EINVAL
+ }
+ entries[i] = e
+ }
+ var err error
+ if d.gids {
+ err = d.task.UserNamespace().SetGIDMap(ctx, entries)
+ } else {
+ err = d.task.UserNamespace().SetUIDMap(ctx, entries)
+ }
+ if err != nil {
+ return 0, err
+ }
+
+ // On success, Linux's kernel/user_namespace.c:map_write() always returns
+ // count, even if fewer bytes were used.
+ return int64(srclen), nil
+}
+
// mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps.
//
// +stateify savable
@@ -616,7 +677,7 @@ func (s *exeSymlink) Readlink(ctx context.Context) (string, error) {
if err != nil {
return "", err
}
- defer exec.DecRef()
+ defer exec.DecRef(ctx)
return exec.PathnameWithDeleted(ctx), nil
}
@@ -631,7 +692,7 @@ func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDent
if err != nil {
return vfs.VirtualDentry{}, "", err
}
- defer exec.DecRef()
+ defer exec.DecRef(ctx)
vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry()
vd.IncRef()
@@ -687,7 +748,7 @@ func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
// Root has been destroyed. Don't try to read mounts.
return nil
}
- defer rootDir.DecRef()
+ defer rootDir.DecRef(ctx)
i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf)
return nil
}
@@ -718,7 +779,7 @@ func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
// Root has been destroyed. Don't try to read mounts.
return nil
}
- defer rootDir.DecRef()
+ defer rootDir.DecRef(ctx)
i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf)
return nil
}
@@ -764,7 +825,7 @@ func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.Vir
dentry.Init(&namespaceInode{})
vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry())
vd.IncRef()
- dentry.DecRef()
+ dentry.DecRef(ctx)
return vd, "", nil
}
@@ -775,6 +836,8 @@ type namespaceInode struct {
kernfs.InodeNoopRefCount
kernfs.InodeNotDirectory
kernfs.InodeNotSymlink
+
+ locks vfs.FileLocks
}
var _ kernfs.Inode = (*namespaceInode)(nil)
@@ -791,6 +854,7 @@ func (i *namespaceInode) Init(creds *auth.Credentials, devMajor, devMinor uint32
func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd := &namespaceFD{inode: i}
i.IncRef()
+ fd.LockFD.Init(&i.locks)
if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
return nil, err
}
@@ -801,6 +865,7 @@ func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *
// /proc/[pid]/ns/*.
type namespaceFD struct {
vfs.FileDescriptionDefaultImpl
+ vfs.LockFD
vfsfd vfs.FileDescription
inode *namespaceInode
@@ -811,7 +876,7 @@ var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil)
// Stat implements FileDescriptionImpl.
func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
- return fd.inode.Stat(vfs, opts)
+ return fd.inode.Stat(ctx, vfs, opts)
}
// SetStat implements FileDescriptionImpl.
@@ -822,11 +887,16 @@ func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) err
}
// Release implements FileDescriptionImpl.
-func (fd *namespaceFD) Release() {
- fd.inode.DecRef()
+func (fd *namespaceFD) Release(ctx context.Context) {
+ fd.inode.DecRef(ctx)
}
-// OnClose implements FileDescriptionImpl.
-func (*namespaceFD) OnClose(context.Context) error {
- return nil
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *namespaceFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *namespaceFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
}
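
The new idMapData.Write above parses each line of a /proc/[pid]/{uid_map,gid_map} write as three space-separated integers: the first ID inside the namespace, the first ID in the parent namespace, and the length of the range. A self-contained sketch of that parsing step, using a local stand-in for auth.IDMapEntry (names here are illustrative, not gVisor's):

    package main

    import (
        "bytes"
        "fmt"
    )

    // idMapEntry stands in for auth.IDMapEntry.
    type idMapEntry struct {
        firstID, firstParentID, length uint32
    }

    const maxIDMapLines = 5 // same limit as user_namespaces(7)

    func parseIDMap(b []byte) ([]idMapEntry, error) {
        b = bytes.TrimSuffix(b, []byte("\n"))
        lines := bytes.SplitN(b, []byte("\n"), maxIDMapLines+1)
        if len(lines) > maxIDMapLines {
            return nil, fmt.Errorf("too many lines")
        }
        entries := make([]idMapEntry, len(lines))
        for i, l := range lines {
            var e idMapEntry
            if _, err := fmt.Sscan(string(l), &e.firstID, &e.firstParentID, &e.length); err != nil {
                return nil, err
            }
            entries[i] = e
        }
        return entries, nil
    }

    func main() {
        // "0 1000 1" maps ID 0 in the namespace to ID 1000 in the parent.
        entries, err := parseIDMap([]byte("0 1000 1\n"))
        fmt.Println(entries, err) // [{0 1000 1}] <nil>
    }
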
diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go
index 6bde27376..a4c884bf9 100644
--- a/pkg/sentry/fsimpl/proc/task_net.go
+++ b/pkg/sentry/fsimpl/proc/task_net.go
@@ -212,7 +212,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
continue
}
if family, _, _ := s.Impl().(socket.SocketVFS2).Type(); family != linux.AF_UNIX {
- s.DecRef()
+ s.DecRef(ctx)
// Not a unix socket.
continue
}
@@ -281,7 +281,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
}
fmt.Fprintf(buf, "\n")
- s.DecRef()
+ s.DecRef(ctx)
}
return nil
}
@@ -359,7 +359,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel,
panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s))
}
if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) {
- s.DecRef()
+ s.DecRef(ctx)
// Not tcp4 sockets.
continue
}
@@ -455,7 +455,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel,
fmt.Fprintf(buf, "\n")
- s.DecRef()
+ s.DecRef(ctx)
}
return nil
@@ -524,7 +524,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s))
}
if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM {
- s.DecRef()
+ s.DecRef(ctx)
// Not udp4 socket.
continue
}
@@ -600,7 +600,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
fmt.Fprintf(buf, "\n")
- s.DecRef()
+ s.DecRef(ctx)
}
return nil
}
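
Most hunks in this change are the same mechanical edit: DecRef and Release now take a context.Context, so teardown code that runs when the last reference is dropped has a context available (for example, to deliver inotify events or detach sockets). A minimal sketch of the pattern, not gVisor's actual refs implementation:

    package refsketch

    import (
        "context"
        "sync/atomic"
    )

    // refCounted sketches a ctx-aware reference count: the destructor may
    // need a context, so DecRef must accept one and thread it through.
    type refCounted struct {
        refs    int64 // starts at 1 for the initial reference
        destroy func(ctx context.Context)
    }

    func (r *refCounted) IncRef() { atomic.AddInt64(&r.refs, 1) }

    func (r *refCounted) DecRef(ctx context.Context) {
        if atomic.AddInt64(&r.refs, -1) == 0 {
            r.destroy(ctx)
        }
    }
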
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index b51d43954..6d2b90a8b 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -43,6 +43,8 @@ type tasksInode struct {
kernfs.OrderedChildren
kernfs.AlwaysValid
+ locks vfs.FileLocks
+
fs *filesystem
pidns *kernel.PIDNamespace
@@ -197,15 +199,15 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
// Open implements kernfs.Inode.
func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
if err != nil {
return nil, err
}
return fd.VFSFileDescription(), nil
}
-func (i *tasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
- stat, err := i.InodeAttrs.Stat(vsfs, opts)
+func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+ stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts)
if err != nil {
return linux.Statx{}, err
}
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
index 6dac2afa4..b71778128 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -55,7 +55,8 @@ func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *ke
if stack := k.RootNetworkNamespace().Stack(); stack != nil {
contents = map[string]*kernfs.Dentry{
"ipv4": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
- "tcp_sack": fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}),
+ "tcp_recovery": fs.newDentry(root, fs.NextIno(), 0644, &tcpRecoveryData{stack: stack}),
+ "tcp_sack": fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}),
// The following files are simple stubs until they are implemented in
// netstack; most of these files are configuration-related. We use the
@@ -207,3 +208,49 @@ func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset
*d.enabled = v != 0
return n, d.stack.SetTCPSACKEnabled(*d.enabled)
}
+
+// tcpRecoveryData implements vfs.WritableDynamicBytesSource for
+// /proc/sys/net/ipv4/tcp_recovery.
+//
+// +stateify savable
+type tcpRecoveryData struct {
+ kernfs.DynamicBytesFile
+
+ stack inet.Stack `state:"wait"`
+}
+
+var _ vfs.WritableDynamicBytesSource = (*tcpRecoveryData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.
+func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ recovery, err := d.stack.TCPRecovery()
+ if err != nil {
+ return err
+ }
+
+ fmt.Fprintf(buf, "%d\n", recovery)
+ return nil
+}
+
+func (d *tcpRecoveryData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+ if offset != 0 {
+ // No need to handle partial writes thus far.
+ return 0, syserror.EINVAL
+ }
+ if src.NumBytes() == 0 {
+ return 0, nil
+ }
+
+ // Limit the amount of memory allocated.
+ src = src.TakeFirst(usermem.PageSize - 1)
+
+ var v int32
+ n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+ if err != nil {
+ return 0, err
+ }
+ if err := d.stack.SetTCPRecovery(inet.TCPLossRecovery(v)); err != nil {
+ return 0, err
+ }
+ return n, nil
+}
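
tcp_recovery behaves like a Linux sysctl file: reads produce the current value followed by a newline, and writes parse a decimal integer from at most one page of input. The real write path uses usermem.CopyInt32StringInVec; a simplified stand-alone sketch of the same parse:

    package main

    import (
        "fmt"
        "strconv"
        "strings"
    )

    // parseSysctlInt32 is a simplification of what
    // usermem.CopyInt32StringInVec does for files like tcp_recovery.
    func parseSysctlInt32(src string) (int32, error) {
        v, err := strconv.ParseInt(strings.TrimSpace(src), 10, 32)
        if err != nil {
            return 0, err
        }
        return int32(v), nil
    }

    func main() {
        // Equivalent of: echo 1 > /proc/sys/net/ipv4/tcp_recovery
        v, err := parseSysctlInt32("1\n")
        fmt.Println(v, err) // 1 <nil>
    }
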
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 19abb5034..3c9297dee 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -218,7 +218,7 @@ func TestTasks(t *testing.T) {
if err != nil {
t.Fatalf("vfsfs.OpenAt(%q) failed: %v", path, err)
}
- defer fd.DecRef()
+ defer fd.DecRef(s.Ctx)
buf := make([]byte, 1)
bufIOSeq := usermem.BytesIOSequence(buf)
if _, err := fd.Read(s.Ctx, bufIOSeq, vfs.ReadOptions{}); err != syserror.EISDIR {
@@ -336,7 +336,7 @@ func TestTasksOffset(t *testing.T) {
if err != nil {
t.Fatalf("vfsfs.OpenAt(/) failed: %v", err)
}
- defer fd.DecRef()
+ defer fd.DecRef(s.Ctx)
if _, err := fd.Seek(s.Ctx, tc.offset, linux.SEEK_SET); err != nil {
t.Fatalf("Seek(%d, SEEK_SET): %v", tc.offset, err)
}
@@ -441,7 +441,7 @@ func iterateDir(ctx context.Context, t *testing.T, s *testutil.System, fd *vfs.F
t.Errorf("vfsfs.OpenAt(%v) failed: %v", absPath, err)
continue
}
- defer child.DecRef()
+ defer child.DecRef(ctx)
stat, err := child.Stat(ctx, vfs.StatOptions{})
if err != nil {
t.Errorf("Stat(%v) failed: %v", absPath, err)
@@ -476,7 +476,7 @@ func TestTree(t *testing.T) {
if err != nil {
t.Fatalf("failed to create test file: %v", err)
}
- defer file.DecRef()
+ defer file.DecRef(s.Ctx)
var tasks []*kernel.Task
for i := 0; i < 5; i++ {
@@ -501,5 +501,5 @@ func TestTree(t *testing.T) {
t.Fatalf("vfsfs.OpenAt(/proc) failed: %v", err)
}
iterateDir(ctx, t, s, fd)
- fd.DecRef()
+ fd.DecRef(ctx)
}
diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go
index d29ef3f83..6297e1df4 100644
--- a/pkg/sentry/fsimpl/signalfd/signalfd.go
+++ b/pkg/sentry/fsimpl/signalfd/signalfd.go
@@ -31,6 +31,7 @@ type SignalFileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
+ vfs.NoLockFD
// target is the original signal target task.
//
@@ -53,7 +54,7 @@ var _ vfs.FileDescriptionImpl = (*SignalFileDescription)(nil)
// New creates a new signal fd.
func New(vfsObj *vfs.VirtualFilesystem, target *kernel.Task, mask linux.SignalSet, flags uint32) (*vfs.FileDescription, error) {
vd := vfsObj.NewAnonVirtualDentry("[signalfd]")
- defer vd.DecRef()
+ defer vd.DecRef(target)
sfd := &SignalFileDescription{
target: target,
mask: mask,
@@ -132,4 +133,4 @@ func (sfd *SignalFileDescription) EventUnregister(entry *waiter.Entry) {
}
// Release implements FileDescriptionImpl.Release()
-func (sfd *SignalFileDescription) Release() {}
+func (sfd *SignalFileDescription) Release(context.Context) {}
diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go
index ee0828a15..c61818ff6 100644
--- a/pkg/sentry/fsimpl/sockfs/sockfs.go
+++ b/pkg/sentry/fsimpl/sockfs/sockfs.go
@@ -67,9 +67,9 @@ func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) {
}
// Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {
+func (fs *filesystem) Release(ctx context.Context) {
fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
- fs.Filesystem.Release()
+ fs.Filesystem.Release(ctx)
}
// PrependPath implements vfs.FilesystemImpl.PrependPath.
diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD
index a741e2bb6..1b548ccd4 100644
--- a/pkg/sentry/fsimpl/sys/BUILD
+++ b/pkg/sentry/fsimpl/sys/BUILD
@@ -29,6 +29,6 @@ go_test(
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/vfs",
- "@com_github_google_go-cmp//cmp:go_default_library",
+ "@com_github_google_go_cmp//cmp:go_default_library",
],
)
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index 0af373604..0401726b6 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -30,6 +30,7 @@ import (
// Name is the default filesystem name.
const Name = "sysfs"
+const defaultSysDirMode = linux.FileMode(0755)
// FilesystemType implements vfs.FilesystemType.
type FilesystemType struct{}
@@ -57,9 +58,6 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
devMinor: devMinor,
}
fs.VFSFilesystem().Init(vfsObj, &fsType, fs)
- k := kernel.KernelFromContext(ctx)
- maxCPUCores := k.ApplicationCores()
- defaultSysDirMode := linux.FileMode(0755)
root := fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
"block": fs.newDir(creds, defaultSysDirMode, nil),
@@ -70,11 +68,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
"dev": fs.newDir(creds, defaultSysDirMode, nil),
"devices": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
"system": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
- "cpu": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
- "online": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)),
- "possible": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)),
- "present": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)),
- }),
+ "cpu": cpuDir(ctx, fs, creds),
}),
}),
"firmware": fs.newDir(creds, defaultSysDirMode, nil),
@@ -86,10 +80,24 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
return fs.VFSFilesystem(), root.VFSDentry(), nil
}
+func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) *kernfs.Dentry {
+ k := kernel.KernelFromContext(ctx)
+ maxCPUCores := k.ApplicationCores()
+ children := map[string]*kernfs.Dentry{
+ "online": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)),
+ "possible": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)),
+ "present": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)),
+ }
+ for i := uint(0); i < maxCPUCores; i++ {
+ children[fmt.Sprintf("cpu%d", i)] = fs.newDir(creds, linux.FileMode(0555), nil)
+ }
+ return fs.newDir(creds, defaultSysDirMode, children)
+}
+
// Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {
+func (fs *filesystem) Release(ctx context.Context) {
fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
- fs.Filesystem.Release()
+ fs.Filesystem.Release(ctx)
}
// dir implements kernfs.Inode.
@@ -98,8 +106,10 @@ type dir struct {
kernfs.InodeNoDynamicLookup
kernfs.InodeNotSymlink
kernfs.InodeDirectoryNoNewChildren
-
kernfs.OrderedChildren
+
+ locks vfs.FileLocks
+
dentry kernfs.Dentry
}
@@ -121,7 +131,7 @@ func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.Set
// Open implements kernfs.Inode.Open.
func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
if err != nil {
return nil, err
}
@@ -136,7 +146,7 @@ type cpuFile struct {
// Generate implements vfs.DynamicBytesSource.Generate.
func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error {
- fmt.Fprintf(buf, "0-%d", c.maxCores-1)
+ fmt.Fprintf(buf, "0-%d\n", c.maxCores-1)
return nil
}
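
Two behavioral changes in sysfs here: the cpu online/possible/present files now end with a newline, matching Linux's range format, and a cpuN directory is created per application core. For example, with four application cores:

    package main

    import "fmt"

    func main() {
        maxCPUCores := uint(4) // stand-in for k.ApplicationCores()
        fmt.Printf("%q\n", fmt.Sprintf("0-%d\n", maxCPUCores-1)) // "0-3\n"
        // And /sys/devices/system/cpu now also contains cpu0 ... cpu3.
    }
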
diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go
index 4b3602d47..9fd38b295 100644
--- a/pkg/sentry/fsimpl/sys/sys_test.go
+++ b/pkg/sentry/fsimpl/sys/sys_test.go
@@ -51,7 +51,7 @@ func TestReadCPUFile(t *testing.T) {
k := kernel.KernelFromContext(s.Ctx)
maxCPUCores := k.ApplicationCores()
- expected := fmt.Sprintf("0-%d", maxCPUCores-1)
+ expected := fmt.Sprintf("0-%d\n", maxCPUCores-1)
for _, fname := range []string{"online", "possible", "present"} {
pop := s.PathOpAtRoot(fmt.Sprintf("devices/system/cpu/%s", fname))
@@ -59,7 +59,7 @@ func TestReadCPUFile(t *testing.T) {
if err != nil {
t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err)
}
- defer fd.DecRef()
+ defer fd.DecRef(s.Ctx)
content, err := s.ReadToEnd(fd)
if err != nil {
t.Fatalf("Read failed: %v", err)
diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD
index 0e4053a46..400a97996 100644
--- a/pkg/sentry/fsimpl/testutil/BUILD
+++ b/pkg/sentry/fsimpl/testutil/BUILD
@@ -32,6 +32,6 @@ go_library(
"//pkg/sentry/vfs",
"//pkg/sync",
"//pkg/usermem",
- "@com_github_google_go-cmp//cmp:go_default_library",
+ "@com_github_google_go_cmp//cmp:go_default_library",
],
)
diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go
index c16a36cdb..1813269e0 100644
--- a/pkg/sentry/fsimpl/testutil/kernel.go
+++ b/pkg/sentry/fsimpl/testutil/kernel.go
@@ -62,6 +62,7 @@ func Boot() (*kernel.Kernel, error) {
return nil, fmt.Errorf("creating platform: %v", err)
}
+ kernel.VFS2Enabled = true
k := &kernel.Kernel{
Platform: plat,
}
@@ -73,7 +74,7 @@ func Boot() (*kernel.Kernel, error) {
k.SetMemoryFile(mf)
// Pass k as the platform since it is savable, unlike the actual platform.
- vdso, err := loader.PrepareVDSO(nil, k)
+ vdso, err := loader.PrepareVDSO(k)
if err != nil {
return nil, fmt.Errorf("creating vdso: %v", err)
}
@@ -103,11 +104,6 @@ func Boot() (*kernel.Kernel, error) {
return nil, fmt.Errorf("initializing kernel: %v", err)
}
- kernel.VFS2Enabled = true
-
- if err := k.VFS().Init(); err != nil {
- return nil, fmt.Errorf("VFS init: %v", err)
- }
k.VFS().MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
AllowUserList: true,
@@ -126,12 +122,16 @@ func Boot() (*kernel.Kernel, error) {
// CreateTask creates a new bare-bones task for tests.
func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns *vfs.MountNamespace, root, cwd vfs.VirtualDentry) (*kernel.Task, error) {
k := kernel.KernelFromContext(ctx)
+ if k == nil {
+ return nil, fmt.Errorf("cannot find kernel from context")
+ }
+
exe, err := newFakeExecutable(ctx, k.VFS(), auth.CredentialsFromContext(ctx), root)
if err != nil {
return nil, err
}
m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation)
- m.SetExecutable(fsbridge.NewVFSFile(exe))
+ m.SetExecutable(ctx, fsbridge.NewVFSFile(exe))
config := &kernel.TaskConfig{
Kernel: k,
diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go
index 0556af877..568132121 100644
--- a/pkg/sentry/fsimpl/testutil/testutil.go
+++ b/pkg/sentry/fsimpl/testutil/testutil.go
@@ -97,8 +97,8 @@ func (s *System) WithTemporaryContext(ctx context.Context) *System {
// Destroy releases the resources associated with a test system.
func (s *System) Destroy() {
- s.Root.DecRef()
- s.MntNs.DecRef() // Reference on MntNs passed to NewSystem.
+ s.Root.DecRef(s.Ctx)
+ s.MntNs.DecRef(s.Ctx) // Reference on MntNs passed to NewSystem.
}
// ReadToEnd reads the contents of fd until EOF to a string.
@@ -149,7 +149,7 @@ func (s *System) ListDirents(pop *vfs.PathOperation) *DirentCollector {
if err != nil {
s.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
}
- defer fd.DecRef()
+ defer fd.DecRef(s.Ctx)
collector := &DirentCollector{}
if err := fd.IterDirents(s.Ctx, collector); err != nil {
diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go
index 60c92d626..86beaa0a8 100644
--- a/pkg/sentry/fsimpl/timerfd/timerfd.go
+++ b/pkg/sentry/fsimpl/timerfd/timerfd.go
@@ -32,6 +32,7 @@ type TimerFileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
+ vfs.NoLockFD
events waiter.Queue
timer *ktime.Timer
@@ -46,9 +47,9 @@ var _ vfs.FileDescriptionImpl = (*TimerFileDescription)(nil)
var _ ktime.TimerListener = (*TimerFileDescription)(nil)
// New returns a new timer fd.
-func New(vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, flags uint32) (*vfs.FileDescription, error) {
+func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, flags uint32) (*vfs.FileDescription, error) {
vd := vfsObj.NewAnonVirtualDentry("[timerfd]")
- defer vd.DecRef()
+ defer vd.DecRef(ctx)
tfd := &TimerFileDescription{}
tfd.timer = ktime.NewTimer(clock, tfd)
if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{
@@ -128,7 +129,7 @@ func (tfd *TimerFileDescription) ResumeTimer() {
}
// Release implements FileDescriptionImpl.Release()
-func (tfd *TimerFileDescription) Release() {
+func (tfd *TimerFileDescription) Release(context.Context) {
tfd.timer.Destroy()
}
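
timerfd.New now takes a context so the anonymous dentry's DecRef can be called with one. A sketch of a call site under the new signature; the surrounding helper and variable names are assumptions, not code from this change:

    package sketch

    import (
        "gvisor.dev/gvisor/pkg/abi/linux"
        "gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd"
        "gvisor.dev/gvisor/pkg/sentry/kernel"
        ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
        "gvisor.dev/gvisor/pkg/sentry/vfs"
    )

    // newTimerFD is a hypothetical helper; *kernel.Task implements
    // context.Context, so the task doubles as the ctx for New and DecRef.
    func newTimerFD(t *kernel.Task, vfsObj *vfs.VirtualFilesystem, clock ktime.Clock) error {
        fd, err := timerfd.New(t, vfsObj, clock, linux.TFD_NONBLOCK)
        if err != nil {
            return err
        }
        defer fd.DecRef(t)
        // ... install fd into the task's FD table ...
        return nil
    }
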
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index 007be1572..5cd428d64 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -26,6 +26,17 @@ go_template_instance(
},
)
+go_template_instance(
+ name = "inode_refs",
+ out = "inode_refs.go",
+ package = "tmpfs",
+ prefix = "inode",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "inode",
+ },
+)
+
go_library(
name = "tmpfs",
srcs = [
@@ -34,6 +45,7 @@ go_library(
"directory.go",
"filesystem.go",
"fstree.go",
+ "inode_refs.go",
"named_pipe.go",
"regular_file.go",
"socket_file.go",
@@ -47,6 +59,7 @@ go_library(
"//pkg/context",
"//pkg/fspath",
"//pkg/log",
+ "//pkg/refs",
"//pkg/safemem",
"//pkg/sentry/arch",
"//pkg/sentry/fs",
@@ -59,9 +72,9 @@ go_library(
"//pkg/sentry/pgalloc",
"//pkg/sentry/platform",
"//pkg/sentry/socket/unix/transport",
+ "//pkg/sentry/uniqueid",
"//pkg/sentry/usage",
"//pkg/sentry/vfs",
- "//pkg/sentry/vfs/lock",
"//pkg/sentry/vfs/memxattr",
"//pkg/sync",
"//pkg/syserror",
diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
index 2fb5c4d84..d263147c2 100644
--- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -83,7 +83,7 @@ func fileOpOn(ctx context.Context, mntns *fs.MountNamespace, root, wd *fs.Dirent
}
err = fn(root, d)
- d.DecRef()
+ d.DecRef(ctx)
return err
}
@@ -105,17 +105,17 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to create mount namespace: %v", err)
}
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
var filePathBuilder strings.Builder
filePathBuilder.WriteByte('/')
// Create nested directories with given depth.
root := mntns.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
d := root
d.IncRef()
- defer d.DecRef()
+ defer d.DecRef(ctx)
for i := depth; i > 0; i-- {
name := fmt.Sprintf("%d", i)
if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil {
@@ -125,7 +125,7 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to walk to directory %q: %v", name, err)
}
- d.DecRef()
+ d.DecRef(ctx)
d = next
filePathBuilder.WriteString(name)
filePathBuilder.WriteByte('/')
@@ -136,7 +136,7 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to create file %q: %v", filename, err)
}
- file.DecRef()
+ file.DecRef(ctx)
filePathBuilder.WriteString(filename)
filePath := filePathBuilder.String()
@@ -176,7 +176,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) {
// Create VFS.
vfsObj := vfs.VirtualFilesystem{}
- if err := vfsObj.Init(); err != nil {
+ if err := vfsObj.Init(ctx); err != nil {
b.Fatalf("VFS init: %v", err)
}
vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
@@ -186,14 +186,14 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to create tmpfs root mount: %v", err)
}
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
var filePathBuilder strings.Builder
filePathBuilder.WriteByte('/')
// Create nested directories with given depth.
root := mntns.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
vd := root
vd.IncRef()
for i := depth; i > 0; i-- {
@@ -212,7 +212,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to walk to directory %q: %v", name, err)
}
- vd.DecRef()
+ vd.DecRef(ctx)
vd = nextVD
filePathBuilder.WriteString(name)
filePathBuilder.WriteByte('/')
@@ -228,12 +228,12 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) {
Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
Mode: 0644,
})
- vd.DecRef()
+ vd.DecRef(ctx)
vd = vfs.VirtualDentry{}
if err != nil {
b.Fatalf("failed to create file %q: %v", filename, err)
}
- defer fd.DecRef()
+ defer fd.DecRef(ctx)
filePathBuilder.WriteString(filename)
filePath := filePathBuilder.String()
@@ -278,14 +278,14 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to create mount namespace: %v", err)
}
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
var filePathBuilder strings.Builder
filePathBuilder.WriteByte('/')
// Create and mount the submount.
root := mntns.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
if err := root.Inode.CreateDirectory(ctx, root, mountPointName, fs.FilePermsFromMode(0755)); err != nil {
b.Fatalf("failed to create mount point: %v", err)
}
@@ -293,7 +293,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to walk to mount point: %v", err)
}
- defer mountPoint.DecRef()
+ defer mountPoint.DecRef(ctx)
submountInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
if err != nil {
b.Fatalf("failed to create tmpfs submount: %v", err)
@@ -309,7 +309,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to walk to mount root: %v", err)
}
- defer d.DecRef()
+ defer d.DecRef(ctx)
for i := depth; i > 0; i-- {
name := fmt.Sprintf("%d", i)
if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil {
@@ -319,7 +319,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to walk to directory %q: %v", name, err)
}
- d.DecRef()
+ d.DecRef(ctx)
d = next
filePathBuilder.WriteString(name)
filePathBuilder.WriteByte('/')
@@ -330,7 +330,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to create file %q: %v", filename, err)
}
- file.DecRef()
+ file.DecRef(ctx)
filePathBuilder.WriteString(filename)
filePath := filePathBuilder.String()
@@ -370,7 +370,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
// Create VFS.
vfsObj := vfs.VirtualFilesystem{}
- if err := vfsObj.Init(); err != nil {
+ if err := vfsObj.Init(ctx); err != nil {
b.Fatalf("VFS init: %v", err)
}
vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
@@ -380,14 +380,14 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to create tmpfs root mount: %v", err)
}
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
var filePathBuilder strings.Builder
filePathBuilder.WriteByte('/')
// Create the mount point.
root := mntns.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
pop := vfs.PathOperation{
Root: root,
Start: root,
@@ -403,7 +403,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to walk to mount point: %v", err)
}
- defer mountPoint.DecRef()
+ defer mountPoint.DecRef(ctx)
// Create and mount the submount.
if err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil {
b.Fatalf("failed to mount tmpfs submount: %v", err)
@@ -432,7 +432,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
if err != nil {
b.Fatalf("failed to walk to directory %q: %v", name, err)
}
- vd.DecRef()
+ vd.DecRef(ctx)
vd = nextVD
filePathBuilder.WriteString(name)
filePathBuilder.WriteByte('/')
@@ -448,11 +448,11 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
Mode: 0644,
})
- vd.DecRef()
+ vd.DecRef(ctx)
if err != nil {
b.Fatalf("failed to create file %q: %v", filename, err)
}
- fd.DecRef()
+ fd.DecRef(ctx)
filePathBuilder.WriteString(filename)
filePath := filePathBuilder.String()
diff --git a/pkg/sentry/fsimpl/tmpfs/device_file.go b/pkg/sentry/fsimpl/tmpfs/device_file.go
index 83bf885ee..ac54d420d 100644
--- a/pkg/sentry/fsimpl/tmpfs/device_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/device_file.go
@@ -29,7 +29,7 @@ type deviceFile struct {
minor uint32
}
-func (fs *filesystem) newDeviceFile(creds *auth.Credentials, mode linux.FileMode, kind vfs.DeviceKind, major, minor uint32) *inode {
+func (fs *filesystem) newDeviceFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, kind vfs.DeviceKind, major, minor uint32) *inode {
file := &deviceFile{
kind: kind,
major: major,
@@ -43,7 +43,7 @@ func (fs *filesystem) newDeviceFile(creds *auth.Credentials, mode linux.FileMode
default:
panic(fmt.Sprintf("invalid DeviceKind: %v", kind))
}
- file.inode.init(file, fs, creds, mode)
+ file.inode.init(file, fs, kuid, kgid, mode)
file.inode.nlink = 1 // from parent directory
return &file.inode
}
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index f2399981b..78b4fc5be 100644
--- a/pkg/sentry/fsimpl/tmpfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -48,9 +48,9 @@ type directory struct {
childList dentryList
}
-func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *directory {
+func (fs *filesystem) newDirectory(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *directory {
dir := &directory{}
- dir.inode.init(dir, fs, creds, linux.S_IFDIR|mode)
+ dir.inode.init(dir, fs, kuid, kgid, linux.S_IFDIR|mode)
dir.inode.nlink = 2 // from "." and parent directory or ".." for root
dir.dentry.inode = &dir.inode
dir.dentry.vfsd.Init(&dir.dentry)
@@ -81,6 +81,10 @@ func (dir *directory) removeChildLocked(child *dentry) {
dir.iterMu.Unlock()
}
+func (dir *directory) mayDelete(creds *auth.Credentials, child *dentry) error {
+ return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&dir.inode.mode)), auth.KUID(atomic.LoadUint32(&child.inode.uid)))
+}
+
type directoryFD struct {
fileDescription
vfs.DirectoryFileDescriptionDefaultImpl
@@ -91,7 +95,7 @@ type directoryFD struct {
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *directoryFD) Release() {
+func (fd *directoryFD) Release(ctx context.Context) {
if fd.iter != nil {
dir := fd.inode().impl.(*directory)
dir.iterMu.Lock()
@@ -106,6 +110,8 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
fs := fd.filesystem()
dir := fd.inode().impl.(*directory)
+ defer fd.dentry().InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent)
+
// fs.mu is required to read d.parent and dentry.name.
fs.mu.RLock()
defer fs.mu.RUnlock()
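
The new mayDelete helper enforces the sticky-bit rule via vfs.CheckDeleteSticky: in a directory with S_ISVTX set, an entry may only be removed or renamed by the entry's owner, the directory's owner, or a suitably capable task. A stand-alone sketch of that rule (the real check also consults capabilities such as CAP_FOWNER):

    package sketch

    import "errors"

    const sISVTX = 0o1000 // the sticky bit, linux.ModeSticky

    // errEPERM stands in for syserror.EPERM.
    var errEPERM = errors.New("operation not permitted")

    // checkDeleteSticky is an illustrative version of the rule behind
    // vfs.CheckDeleteSticky.
    func checkDeleteSticky(callerUID, dirOwnerUID, victimOwnerUID, dirMode uint32) error {
        if dirMode&sISVTX == 0 {
            return nil // no sticky bit: directory write permission suffices
        }
        if callerUID == victimOwnerUID || callerUID == dirOwnerUID {
            return nil
        }
        return errEPERM
    }
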
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 80fa7b29d..a4864df53 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -40,7 +40,7 @@ func (fs *filesystem) Sync(ctx context.Context) error {
// stepLocked is loosely analogous to fs/namei.c:walk_component().
//
// Preconditions: filesystem.mu must be locked. !rp.Done().
-func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
+func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
dir, ok := d.inode.impl.(*directory)
if !ok {
return nil, syserror.ENOTDIR
@@ -55,13 +55,13 @@ afterSymlink:
return d, nil
}
if name == ".." {
- if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil {
+ if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
return nil, err
} else if isRoot || d.parent == nil {
rp.Advance()
return d, nil
}
- if err := rp.CheckMount(&d.parent.vfsd); err != nil {
+ if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
return nil, err
}
rp.Advance()
@@ -74,12 +74,12 @@ afterSymlink:
if !ok {
return nil, syserror.ENOENT
}
- if err := rp.CheckMount(&child.vfsd); err != nil {
+ if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
return nil, err
}
if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
// Symlink traversal updates access time.
- atomic.StoreInt64(&d.inode.atime, d.inode.fs.clock.Now().Nanoseconds())
+ child.inode.touchAtime(rp.Mount())
if err := rp.HandleSymlink(symlink.target); err != nil {
return nil, err
}
@@ -98,9 +98,9 @@ afterSymlink:
// fs/namei.c:path_parentat().
//
// Preconditions: filesystem.mu must be locked. !rp.Done().
-func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*directory, error) {
+func walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*directory, error) {
for !rp.Final() {
- next, err := stepLocked(rp, d)
+ next, err := stepLocked(ctx, rp, d)
if err != nil {
return nil, err
}
@@ -118,10 +118,10 @@ func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*directory, error) {
// resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
//
// Preconditions: filesystem.mu must be locked.
-func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) {
+func resolveLocked(ctx context.Context, rp *vfs.ResolvingPath) (*dentry, error) {
d := rp.Start().Impl().(*dentry)
for !rp.Done() {
- next, err := stepLocked(rp, d)
+ next, err := stepLocked(ctx, rp, d)
if err != nil {
return nil, err
}
@@ -141,10 +141,10 @@ func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) {
//
// Preconditions: !rp.Done(). For the final path component in rp,
// !rp.ShouldFollowSymlink().
-func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error {
+func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error {
fs.mu.Lock()
defer fs.mu.Unlock()
- parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+ parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
if err != nil {
return err
}
@@ -177,6 +177,12 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa
if err := create(parentDir, name); err != nil {
return err
}
+
+ ev := linux.IN_CREATE
+ if dir {
+ ev |= linux.IN_ISDIR
+ }
+ parentDir.inode.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
parentDir.inode.touchCMtime()
return nil
}
@@ -185,7 +191,7 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa
func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
fs.mu.RLock()
defer fs.mu.RUnlock()
- d, err := resolveLocked(rp)
+ d, err := resolveLocked(ctx, rp)
if err != nil {
return err
}
@@ -196,7 +202,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds
func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
- d, err := resolveLocked(rp)
+ d, err := resolveLocked(ctx, rp)
if err != nil {
return nil, err
}
@@ -216,7 +222,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
- dir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+ dir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
if err != nil {
return nil, err
}
@@ -226,34 +232,40 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa
// LinkAt implements vfs.FilesystemImpl.LinkAt.
func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
- return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error {
+ return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error {
if rp.Mount() != vd.Mount() {
return syserror.EXDEV
}
d := vd.Dentry().Impl().(*dentry)
- if d.inode.isDir() {
+ i := d.inode
+ if i.isDir() {
return syserror.EPERM
}
- if d.inode.nlink == 0 {
+ if err := vfs.MayLink(auth.CredentialsFromContext(ctx), linux.FileMode(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
+ return err
+ }
+ if i.nlink == 0 {
return syserror.ENOENT
}
- if d.inode.nlink == maxLinks {
+ if i.nlink == maxLinks {
return syserror.EMLINK
}
- d.inode.incLinksLocked()
- parentDir.insertChildLocked(fs.newDentry(d.inode), name)
+ i.incLinksLocked()
+ i.watches.Notify(ctx, "", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */)
+ parentDir.insertChildLocked(fs.newDentry(i), name)
return nil
})
}
// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
- return fs.doCreateAt(rp, true /* dir */, func(parentDir *directory, name string) error {
+ return fs.doCreateAt(ctx, rp, true /* dir */, func(parentDir *directory, name string) error {
+ creds := rp.Credentials()
if parentDir.inode.nlink == maxLinks {
return syserror.EMLINK
}
parentDir.inode.incLinksLocked() // from child's ".."
- childDir := fs.newDirectory(rp.Credentials(), opts.Mode)
+ childDir := fs.newDirectory(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)
parentDir.insertChildLocked(&childDir.dentry, name)
return nil
})
@@ -261,19 +273,20 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
// MknodAt implements vfs.FilesystemImpl.MknodAt.
func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
- return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error {
+ return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error {
+ creds := rp.Credentials()
var childInode *inode
switch opts.Mode.FileType() {
- case 0, linux.S_IFREG:
- childInode = fs.newRegularFile(rp.Credentials(), opts.Mode)
+ case linux.S_IFREG:
+ childInode = fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)
case linux.S_IFIFO:
- childInode = fs.newNamedPipe(rp.Credentials(), opts.Mode)
+ childInode = fs.newNamedPipe(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)
case linux.S_IFBLK:
- childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor)
+ childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor)
case linux.S_IFCHR:
- childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor)
+ childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor)
case linux.S_IFSOCK:
- childInode = fs.newSocketFile(rp.Credentials(), opts.Mode, opts.Endpoint)
+ childInode = fs.newSocketFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, opts.Endpoint)
default:
return syserror.EINVAL
}
@@ -295,7 +308,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
if opts.Flags&linux.O_CREAT == 0 {
fs.mu.RLock()
defer fs.mu.RUnlock()
- d, err := resolveLocked(rp)
+ d, err := resolveLocked(ctx, rp)
if err != nil {
return nil, err
}
@@ -307,7 +320,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
fs.mu.Lock()
defer fs.mu.Unlock()
if rp.Done() {
- // Reject attempts to open directories with O_CREAT.
+ // Reject attempts to open the mount root directory with O_CREAT.
if rp.MustBeDir() {
return nil, syserror.EISDIR
}
@@ -317,7 +330,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
return start.open(ctx, rp, &opts, false /* afterCreate */)
}
afterTrailingSymlink:
- parentDir, err := walkParentDirLocked(rp, start)
+ parentDir, err := walkParentDirLocked(ctx, rp, start)
if err != nil {
return nil, err
}
@@ -348,23 +361,28 @@ afterTrailingSymlink:
}
defer rp.Mount().EndWrite()
// Create and open the child.
- child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
+ creds := rp.Credentials()
+ child := fs.newDentry(fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode))
parentDir.insertChildLocked(child, name)
fd, err := child.open(ctx, rp, &opts, true)
if err != nil {
return nil, err
}
+ parentDir.inode.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */)
parentDir.inode.touchCMtime()
return fd, nil
}
+ if mustCreate {
+ return nil, syserror.EEXIST
+ }
// Is the file mounted over?
- if err := rp.CheckMount(&child.vfsd); err != nil {
+ if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
return nil, err
}
// Do we need to resolve a trailing symlink?
if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
// Symlink traversal updates access time.
- atomic.StoreInt64(&child.inode.atime, child.inode.fs.clock.Now().Nanoseconds())
+ child.inode.touchAtime(rp.Mount())
if err := rp.HandleSymlink(symlink.target); err != nil {
return nil, err
}
@@ -388,10 +406,11 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
switch impl := d.inode.impl.(type) {
case *regularFile:
var fd regularFileFD
- if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+ fd.LockFD.Init(&d.inode.locks)
+ if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil {
return nil, err
}
- if opts.Flags&linux.O_TRUNC != 0 {
+ if !afterCreate && opts.Flags&linux.O_TRUNC != 0 {
if _, err := impl.truncate(0); err != nil {
return nil, err
}
@@ -403,15 +422,16 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
return nil, syserror.EISDIR
}
var fd directoryFD
- if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+ fd.LockFD.Init(&d.inode.locks)
+ if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil {
return nil, err
}
return &fd.vfsfd, nil
case *symlink:
- // Can't open symlinks without O_PATH (which is unimplemented).
+ // TODO(gvisor.dev/issue/2782): Can't open symlinks without O_PATH.
return nil, syserror.ELOOP
case *namedPipe:
- return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags)
+ return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags, &d.inode.locks)
case *deviceFile:
return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts)
case *socketFile:
@@ -425,7 +445,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
- d, err := resolveLocked(rp)
+ d, err := resolveLocked(ctx, rp)
if err != nil {
return "", err
}
@@ -447,7 +467,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
// Resolve newParent first to verify that it's on this Mount.
fs.mu.Lock()
defer fs.mu.Unlock()
- newParentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+ newParentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
if err != nil {
return err
}
@@ -472,6 +492,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
if !ok {
return syserror.ENOENT
}
+ if err := oldParentDir.mayDelete(rp.Credentials(), renamed); err != nil {
+ return err
+ }
// Note that we don't need to call rp.CheckMount(), since if renamed is a
// mount point then we want to rename the mount point, not anything in the
// mounted filesystem.
@@ -532,7 +555,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
}
vfsObj := rp.VirtualFilesystem()
mntns := vfs.MountNamespaceFromContext(ctx)
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
var replacedVFSD *vfs.Dentry
if replaced != nil {
replacedVFSD = &replaced.vfsd
@@ -543,22 +566,26 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
if replaced != nil {
newParentDir.removeChildLocked(replaced)
if replaced.inode.isDir() {
- newParentDir.inode.decLinksLocked() // from replaced's ".."
+ // Remove links for replaced/. and replaced/..
+ replaced.inode.decLinksLocked(ctx)
+ newParentDir.inode.decLinksLocked(ctx)
}
- replaced.inode.decLinksLocked()
+ replaced.inode.decLinksLocked(ctx)
}
oldParentDir.removeChildLocked(renamed)
newParentDir.insertChildLocked(renamed, newName)
- vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD)
+ vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD)
oldParentDir.inode.touchCMtime()
if oldParentDir != newParentDir {
if renamed.inode.isDir() {
- oldParentDir.inode.decLinksLocked()
+ oldParentDir.inode.decLinksLocked(ctx)
newParentDir.inode.incLinksLocked()
}
newParentDir.inode.touchCMtime()
}
renamed.inode.touchCtime()
+
+ vfs.InotifyRename(ctx, &renamed.inode.watches, &oldParentDir.inode.watches, &newParentDir.inode.watches, oldName, newName, renamed.inode.isDir())
return nil
}
@@ -566,7 +593,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
fs.mu.Lock()
defer fs.mu.Unlock()
- parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+ parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
if err != nil {
return err
}
@@ -584,6 +611,9 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
if !ok {
return syserror.ENOENT
}
+ if err := parentDir.mayDelete(rp.Credentials(), child); err != nil {
+ return err
+ }
childDir, ok := child.inode.impl.(*directory)
if !ok {
return syserror.ENOTDIR
@@ -598,14 +628,17 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
defer mnt.EndWrite()
vfsObj := rp.VirtualFilesystem()
mntns := vfs.MountNamespaceFromContext(ctx)
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
return err
}
parentDir.removeChildLocked(child)
- parentDir.inode.decLinksLocked() // from child's ".."
- child.inode.decLinksLocked()
- vfsObj.CommitDeleteDentry(&child.vfsd)
+ parentDir.inode.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */)
+ // Remove links for child, child/., and child/..
+ child.inode.decLinksLocked(ctx)
+ child.inode.decLinksLocked(ctx)
+ parentDir.inode.decLinksLocked(ctx)
+ vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
parentDir.inode.touchCMtime()
return nil
}
@@ -613,19 +646,28 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
fs.mu.RLock()
- defer fs.mu.RUnlock()
- d, err := resolveLocked(rp)
+ d, err := resolveLocked(ctx, rp)
if err != nil {
+ fs.mu.RUnlock()
return err
}
- return d.inode.setStat(ctx, rp.Credentials(), &opts.Stat)
+ if err := d.inode.setStat(ctx, rp.Credentials(), &opts); err != nil {
+ fs.mu.RUnlock()
+ return err
+ }
+ fs.mu.RUnlock()
+
+ if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+ d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
+ }
+ return nil
}
// StatAt implements vfs.FilesystemImpl.StatAt.
func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
- d, err := resolveLocked(rp)
+ d, err := resolveLocked(ctx, rp)
if err != nil {
return linux.Statx{}, err
}
@@ -638,7 +680,7 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
- if _, err := resolveLocked(rp); err != nil {
+ if _, err := resolveLocked(ctx, rp); err != nil {
return linux.Statfs{}, err
}
statfs := linux.Statfs{
@@ -655,8 +697,9 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
- return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error {
- child := fs.newDentry(fs.newSymlink(rp.Credentials(), target))
+ return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error {
+ creds := rp.Credentials()
+ child := fs.newDentry(fs.newSymlink(creds.EffectiveKUID, creds.EffectiveKGID, 0777, target))
parentDir.insertChildLocked(child, name)
return nil
})
@@ -666,7 +709,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
fs.mu.Lock()
defer fs.mu.Unlock()
- parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+ parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
if err != nil {
return err
}
@@ -681,6 +724,9 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
if !ok {
return syserror.ENOENT
}
+ if err := parentDir.mayDelete(rp.Credentials(), child); err != nil {
+ return err
+ }
if child.inode.isDir() {
return syserror.EISDIR
}
@@ -694,13 +740,19 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
defer mnt.EndWrite()
vfsObj := rp.VirtualFilesystem()
mntns := vfs.MountNamespaceFromContext(ctx)
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
return err
}
+
+ // Generate inotify events. Note that this must take place before the link
+ // count of the child is decremented, or else the watches may be dropped
+ // before these events are added.
+ vfs.InotifyRemoveChild(ctx, &child.inode.watches, &parentDir.inode.watches, name)
+
parentDir.removeChildLocked(child)
- child.inode.decLinksLocked()
- vfsObj.CommitDeleteDentry(&child.vfsd)
+ child.inode.decLinksLocked(ctx)
+ vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
parentDir.inode.touchCMtime()
return nil
}
@@ -709,7 +761,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
- d, err := resolveLocked(rp)
+ d, err := resolveLocked(ctx, rp)
if err != nil {
return nil, err
}
@@ -728,7 +780,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
- d, err := resolveLocked(rp)
+ d, err := resolveLocked(ctx, rp)
if err != nil {
return nil, err
}
@@ -739,7 +791,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
- d, err := resolveLocked(rp)
+ d, err := resolveLocked(ctx, rp)
if err != nil {
return "", err
}
@@ -749,23 +801,37 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
fs.mu.RLock()
- defer fs.mu.RUnlock()
- d, err := resolveLocked(rp)
+ d, err := resolveLocked(ctx, rp)
if err != nil {
+ fs.mu.RUnlock()
+ return err
+ }
+ if err := d.inode.setxattr(rp.Credentials(), &opts); err != nil {
+ fs.mu.RUnlock()
return err
}
- return d.inode.setxattr(rp.Credentials(), &opts)
+ fs.mu.RUnlock()
+
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ return nil
}
// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
fs.mu.RLock()
- defer fs.mu.RUnlock()
- d, err := resolveLocked(rp)
+ d, err := resolveLocked(ctx, rp)
if err != nil {
+ fs.mu.RUnlock()
return err
}
- return d.inode.removexattr(rp.Credentials(), name)
+ if err := d.inode.removexattr(rp.Credentials(), name); err != nil {
+ fs.mu.RUnlock()
+ return err
+ }
+ fs.mu.RUnlock()
+
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ return nil
}
// PrependPath implements vfs.FilesystemImpl.PrependPath.
diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
index 8d77b3fa8..739350cf0 100644
--- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -30,9 +30,9 @@ type namedPipe struct {
// Preconditions:
// * fs.mu must be locked.
// * rp.Mount().CheckBeginWrite() has been called successfully.
-func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode {
+func (fs *filesystem) newNamedPipe(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode {
file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)}
- file.inode.init(file, fs, creds, linux.S_IFIFO|mode)
+ file.inode.init(file, fs, kuid, kgid, linux.S_IFIFO|mode)
file.inode.nlink = 1 // Only the parent has a link.
return &file.inode
}
diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
index 1614f2c39..ec2701d8b 100644
--- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
@@ -32,7 +32,7 @@ const fileName = "mypipe"
func TestSeparateFDs(t *testing.T) {
ctx, creds, vfsObj, root := setup(t)
- defer root.DecRef()
+ defer root.DecRef(ctx)
// Open the read side. This is done concurrently because opening
// one end of the pipe blocks until the other end is opened.
@@ -55,13 +55,13 @@ func TestSeparateFDs(t *testing.T) {
if err != nil {
t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
}
- defer wfd.DecRef()
+ defer wfd.DecRef(ctx)
rfd, ok := <-rfdchan
if !ok {
t.Fatalf("failed to open pipe for reading %q", fileName)
}
- defer rfd.DecRef()
+ defer rfd.DecRef(ctx)
const msg = "vamos azul"
checkEmpty(ctx, t, rfd)
@@ -71,7 +71,7 @@ func TestSeparateFDs(t *testing.T) {
func TestNonblockingRead(t *testing.T) {
ctx, creds, vfsObj, root := setup(t)
- defer root.DecRef()
+ defer root.DecRef(ctx)
// Open the read side as nonblocking.
pop := vfs.PathOperation{
@@ -85,7 +85,7 @@ func TestNonblockingRead(t *testing.T) {
if err != nil {
t.Fatalf("failed to open pipe for reading %q: %v", fileName, err)
}
- defer rfd.DecRef()
+ defer rfd.DecRef(ctx)
// Open the write side.
openOpts = vfs.OpenOptions{Flags: linux.O_WRONLY}
@@ -93,7 +93,7 @@ func TestNonblockingRead(t *testing.T) {
if err != nil {
t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
}
- defer wfd.DecRef()
+ defer wfd.DecRef(ctx)
const msg = "geh blau"
checkEmpty(ctx, t, rfd)
@@ -103,7 +103,7 @@ func TestNonblockingRead(t *testing.T) {
func TestNonblockingWriteError(t *testing.T) {
ctx, creds, vfsObj, root := setup(t)
- defer root.DecRef()
+ defer root.DecRef(ctx)
// Open the write side as nonblocking, which should return ENXIO.
pop := vfs.PathOperation{
@@ -121,7 +121,7 @@ func TestNonblockingWriteError(t *testing.T) {
func TestSingleFD(t *testing.T) {
ctx, creds, vfsObj, root := setup(t)
- defer root.DecRef()
+ defer root.DecRef(ctx)
// Open the pipe as readable and writable.
pop := vfs.PathOperation{
@@ -135,7 +135,7 @@ func TestSingleFD(t *testing.T) {
if err != nil {
t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
}
- defer fd.DecRef()
+ defer fd.DecRef(ctx)
const msg = "forza blu"
checkEmpty(ctx, t, fd)
@@ -152,7 +152,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
// Create VFS.
vfsObj := &vfs.VirtualFilesystem{}
- if err := vfsObj.Init(); err != nil {
+ if err := vfsObj.Init(ctx); err != nil {
t.Fatalf("VFS init: %v", err)
}
vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index 3f433d666..0710b65db 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -25,7 +25,6 @@ import (
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
- "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
@@ -85,12 +84,12 @@ type regularFile struct {
size uint64
}
-func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode {
+func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode {
file := &regularFile{
memFile: fs.memFile,
seals: linux.F_SEAL_SEAL,
}
- file.inode.init(file, fs, creds, linux.S_IFREG|mode)
+ file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode)
file.inode.nlink = 1 // from parent directory
return &file.inode
}
@@ -271,15 +270,39 @@ type regularFileFD struct {
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *regularFileFD) Release() {
+func (fd *regularFileFD) Release(context.Context) {
// noop
}
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ f := fd.inode().impl.(*regularFile)
+
+ f.inode.mu.Lock()
+ defer f.inode.mu.Unlock()
+ oldSize := f.size
+ size := offset + length
+ if oldSize >= size {
+ return nil
+ }
+ _, err := f.truncateLocked(size)
+ return err
+}
+
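
Allocate gives tmpfs the grow-only semantics of default-mode fallocate(2): a request already covered by the current size is a no-op, and anything larger extends the file (new bytes read as zero) through truncateLocked. A sketch of the effective logic, assuming a 4096-byte file:

    end := offset + length
    if f.size >= end {
        return nil // e.g. offset=1024, length=2048: already covered
    }
    // e.g. offset=4096, length=4096: the file grows to 8192.
    _, err := f.truncateLocked(end)
    return err
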
// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
if offset < 0 {
return 0, syserror.EINVAL
}
+
+ // Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since
+ // all state is in-memory.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+ if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
+ return 0, syserror.EOPNOTSUPP
+ }
+
if dst.NumBytes() == 0 {
return 0, nil
}
@@ -302,40 +325,60 @@ func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts
// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ n, _, err := fd.pwrite(ctx, src, offset, opts)
+ return n, err
+}
+
+// pwrite returns the number of bytes written, the final offset, and an
+// error. The final offset should be ignored by PWrite.
+func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
if offset < 0 {
- return 0, syserror.EINVAL
+ return 0, offset, syserror.EINVAL
}
+
+ // Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since
+ // all state is in-memory.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+ if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
+ return 0, offset, syserror.EOPNOTSUPP
+ }
+
srclen := src.NumBytes()
if srclen == 0 {
- return 0, nil
+ return 0, offset, nil
}
f := fd.inode().impl.(*regularFile)
+ f.inode.mu.Lock()
+ defer f.inode.mu.Unlock()
+ // If the file is opened with O_APPEND, update offset to file size.
+ if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+ // Locking f.inode.mu is sufficient for reading f.size.
+ offset = int64(f.size)
+ }
if end := offset + srclen; end < offset {
// Overflow.
- return 0, syserror.EFBIG
+ return 0, offset, syserror.EINVAL
}
- var err error
srclen, err = vfs.CheckLimit(ctx, offset, srclen)
if err != nil {
- return 0, err
+ return 0, offset, err
}
src = src.TakeFirst64(srclen)
- f.inode.mu.Lock()
rw := getRegularFileReadWriter(f, offset)
n, err := src.CopyInTo(ctx, rw)
- fd.inode().touchCMtimeLocked()
- f.inode.mu.Unlock()
+ f.inode.touchCMtimeLocked()
putRegularFileReadWriter(rw)
- return n, err
+ return n, n + offset, err
}
// Write implements vfs.FileDescriptionImpl.Write.
func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
fd.offMu.Lock()
- n, err := fd.PWrite(ctx, src, fd.off, opts)
- fd.off += n
+ n, off, err := fd.pwrite(ctx, src, fd.off, opts)
+ fd.off = off
fd.offMu.Unlock()
return n, err
}
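
Returning the final offset from pwrite is what makes the new O_APPEND handling composable with Write: once the effective offset can differ from the one passed in, Write can no longer advance its cursor with fd.off += n. A sketch of the failure mode, assuming fd.off == 0, O_APPEND set, and a file already holding 100 bytes:

    n, _ := fd.PWrite(ctx, src, 0, opts) // data actually lands at offset 100
    fd.off += n                          // old code: fd.off = n, not 100+n

With the split, Write assigns fd.off = off, the 100+n returned by pwrite.
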
@@ -361,33 +404,6 @@ func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (
return offset, nil
}
-// Sync implements vfs.FileDescriptionImpl.Sync.
-func (fd *regularFileFD) Sync(ctx context.Context) error {
- return nil
-}
-
-// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
-func (fd *regularFileFD) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error {
- return fd.inode().lockBSD(uid, t, block)
-}
-
-// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD.
-func (fd *regularFileFD) UnlockBSD(ctx context.Context, uid lock.UniqueID) error {
- fd.inode().unlockBSD(uid)
- return nil
-}
-
-// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
-func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error {
- return fd.inode().lockPOSIX(uid, t, rng, block)
-}
-
-// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
-func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error {
- fd.inode().unlockPOSIX(uid, rng)
- return nil
-}
-
// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
file := fd.inode().impl.(*regularFile)
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
index 64e1c40ad..146c7fdfe 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
@@ -138,48 +138,37 @@ func TestLocks(t *testing.T) {
}
defer cleanup()
- var (
- uid1 lock.UniqueID
- uid2 lock.UniqueID
- // Non-blocking.
- block lock.Blocker
- )
-
- uid1 = 123
- uid2 = 456
-
- if err := fd.Impl().LockBSD(ctx, uid1, lock.ReadLock, block); err != nil {
+ uid1 := 123
+ uid2 := 456
+ if err := fd.Impl().LockBSD(ctx, uid1, lock.ReadLock, nil); err != nil {
t.Fatalf("fd.Impl().LockBSD failed: err = %v", err)
}
- if err := fd.Impl().LockBSD(ctx, uid2, lock.ReadLock, block); err != nil {
+ if err := fd.Impl().LockBSD(ctx, uid2, lock.ReadLock, nil); err != nil {
t.Fatalf("fd.Impl().LockBSD failed: err = %v", err)
}
- if got, want := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, block), syserror.ErrWouldBlock; got != want {
+ if got, want := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, nil), syserror.ErrWouldBlock; got != want {
t.Fatalf("fd.Impl().LockBSD failed: got = %v, want = %v", got, want)
}
if err := fd.Impl().UnlockBSD(ctx, uid1); err != nil {
t.Fatalf("fd.Impl().UnlockBSD failed: err = %v", err)
}
- if err := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, block); err != nil {
+ if err := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, nil); err != nil {
t.Fatalf("fd.Impl().LockBSD failed: err = %v", err)
}
- rng1 := lock.LockRange{0, 1}
- rng2 := lock.LockRange{1, 2}
-
- if err := fd.Impl().LockPOSIX(ctx, uid1, lock.ReadLock, rng1, block); err != nil {
+ if err := fd.Impl().LockPOSIX(ctx, uid1, lock.ReadLock, 0, 1, linux.SEEK_SET, nil); err != nil {
t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err)
}
- if err := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, rng2, block); err != nil {
+ if err := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, 1, 2, linux.SEEK_SET, nil); err != nil {
t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err)
}
- if err := fd.Impl().LockPOSIX(ctx, uid1, lock.WriteLock, rng1, block); err != nil {
+ if err := fd.Impl().LockPOSIX(ctx, uid1, lock.WriteLock, 0, 1, linux.SEEK_SET, nil); err != nil {
t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err)
}
- if got, want := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, rng1, block), syserror.ErrWouldBlock; got != want {
+ if got, want := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, 0, 1, linux.SEEK_SET, nil), syserror.ErrWouldBlock; got != want {
t.Fatalf("fd.Impl().LockPOSIX failed: got = %v, want = %v", got, want)
}
- if err := fd.Impl().UnlockPOSIX(ctx, uid1, rng1); err != nil {
+ if err := fd.Impl().UnlockPOSIX(ctx, uid1, 0, 1, linux.SEEK_SET); err != nil {
t.Fatalf("fd.Impl().UnlockPOSIX failed: err = %v", err)
}
}
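
The test reflects the new vfs.FileDescriptionImpl lock API, which mirrors struct flock directly: LockPOSIX takes (start, length, whence) and computes the byte range internally rather than accepting a precomputed lock.LockRange, and, per fcntl(2), a length of 0 extends the lock through EOF. For example:

    // Write-lock the whole file, fcntl-style: [0, EOF) from the start.
    fd.Impl().LockPOSIX(ctx, uid1, lock.WriteLock, 0, 0, linux.SEEK_SET, nil)
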
diff --git a/pkg/sentry/fsimpl/tmpfs/socket_file.go b/pkg/sentry/fsimpl/tmpfs/socket_file.go
index 25c2321af..3ed650474 100644
--- a/pkg/sentry/fsimpl/tmpfs/socket_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/socket_file.go
@@ -26,9 +26,9 @@ type socketFile struct {
ep transport.BoundEndpoint
}
-func (fs *filesystem) newSocketFile(creds *auth.Credentials, mode linux.FileMode, ep transport.BoundEndpoint) *inode {
+func (fs *filesystem) newSocketFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, ep transport.BoundEndpoint) *inode {
file := &socketFile{ep: ep}
- file.inode.init(file, fs, creds, mode)
+ file.inode.init(file, fs, kuid, kgid, mode)
file.inode.nlink = 1 // from parent directory
return &file.inode
}
diff --git a/pkg/sentry/fsimpl/tmpfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go
index 47e075ed4..b0de5fabe 100644
--- a/pkg/sentry/fsimpl/tmpfs/symlink.go
+++ b/pkg/sentry/fsimpl/tmpfs/symlink.go
@@ -24,11 +24,11 @@ type symlink struct {
target string // immutable
}
-func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode {
+func (fs *filesystem) newSymlink(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, target string) *inode {
link := &symlink{
target: target,
}
- link.inode.init(link, fs, creds, linux.S_IFLNK|0777)
+ link.inode.init(link, fs, kuid, kgid, linux.S_IFLNK|mode)
link.inode.nlink = 1 // from parent directory
return &link.inode
}
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 1e781aecd..de2af6d01 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -30,6 +30,7 @@ package tmpfs
import (
"fmt"
"math"
+ "strconv"
"strings"
"sync/atomic"
@@ -40,7 +41,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/sentry/vfs/lock"
"gvisor.dev/gvisor/pkg/sentry/vfs/memxattr"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
@@ -112,6 +112,58 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
}
}
+ mopts := vfs.GenericParseMountOptions(opts.Data)
+ rootMode := linux.FileMode(0777)
+ if rootFileType == linux.S_IFDIR {
+ rootMode = 01777
+ }
+ modeStr, ok := mopts["mode"]
+ if ok {
+ delete(mopts, "mode")
+ mode, err := strconv.ParseUint(modeStr, 8, 32)
+ if err != nil {
+ ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid mode: %q", modeStr)
+ return nil, nil, syserror.EINVAL
+ }
+ rootMode = linux.FileMode(mode & 07777)
+ }
+ rootKUID := creds.EffectiveKUID
+ uidStr, ok := mopts["uid"]
+ if ok {
+ delete(mopts, "uid")
+ uid, err := strconv.ParseUint(uidStr, 10, 32)
+ if err != nil {
+ ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid uid: %q", uidStr)
+ return nil, nil, syserror.EINVAL
+ }
+ kuid := creds.UserNamespace.MapToKUID(auth.UID(uid))
+ if !kuid.Ok() {
+ ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped uid: %d", uid)
+ return nil, nil, syserror.EINVAL
+ }
+ rootKUID = kuid
+ }
+ rootKGID := creds.EffectiveKGID
+ gidStr, ok := mopts["gid"]
+ if ok {
+ delete(mopts, "gid")
+ gid, err := strconv.ParseUint(gidStr, 10, 32)
+ if err != nil {
+ ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid gid: %q", gidStr)
+ return nil, nil, syserror.EINVAL
+ }
+ kgid := creds.UserNamespace.MapToKGID(auth.GID(gid))
+ if !kgid.Ok() {
+ ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped gid: %d", gid)
+ return nil, nil, syserror.EINVAL
+ }
+ rootKGID = kgid
+ }
+ if len(mopts) != 0 {
+ ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unknown options: %v", mopts)
+ return nil, nil, syserror.EINVAL
+ }
+
devMinor, err := vfsObj.GetAnonBlockDevMinor()
if err != nil {
return nil, nil, err
@@ -127,13 +179,13 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
var root *dentry
switch rootFileType {
case linux.S_IFREG:
- root = fs.newDentry(fs.newRegularFile(creds, 0777))
+ root = fs.newDentry(fs.newRegularFile(rootKUID, rootKGID, rootMode))
case linux.S_IFLNK:
- root = fs.newDentry(fs.newSymlink(creds, tmpfsOpts.RootSymlinkTarget))
+ root = fs.newDentry(fs.newSymlink(rootKUID, rootKGID, rootMode, tmpfsOpts.RootSymlinkTarget))
case linux.S_IFDIR:
- root = &fs.newDirectory(creds, 01777).dentry
+ root = &fs.newDirectory(rootKUID, rootKGID, rootMode).dentry
default:
- fs.vfsfs.DecRef()
+ fs.vfsfs.DecRef(ctx)
return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType)
}
return &fs.vfsfs, &root.vfsd, nil
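
With the parsing above, tmpfs root ownership and permissions can be set at mount time, matching Linux tmpfs: mode is parsed base-8, uid/gid must map into creds.UserNamespace, and any unknown or malformed option fails the mount with EINVAL. A minimal sketch of a caller, assuming the test-style setup used elsewhere in this change:

    mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs",
        &vfs.GetFilesystemOptions{Data: "mode=0700,uid=1000,gid=1000"})
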
@@ -145,7 +197,7 @@ func NewFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *au
}
// Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {
+func (fs *filesystem) Release(ctx context.Context) {
fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
}
@@ -197,24 +249,46 @@ func (d *dentry) TryIncRef() bool {
}
// DecRef implements vfs.DentryImpl.DecRef.
-func (d *dentry) DecRef() {
- d.inode.decRef()
+func (d *dentry) DecRef(ctx context.Context) {
+ d.inode.decRef(ctx)
}
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
+ if d.inode.isDir() {
+ events |= linux.IN_ISDIR
+ }
+
+ // tmpfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates
+ // that d was deleted.
+ deleted := d.vfsd.IsDead()
+
+ d.inode.fs.mu.RLock()
+ // The ordering below is important: Linux always notifies the parent first.
+ if d.parent != nil {
+ d.parent.inode.watches.Notify(ctx, d.name, events, cookie, et, deleted)
+ }
+ d.inode.watches.Notify(ctx, "", events, cookie, et, deleted)
+ d.inode.fs.mu.RUnlock()
+}
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *dentry) Watches() *vfs.Watches {
+ return &d.inode.watches
+}
+
+// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
+func (d *dentry) OnZeroWatches(context.Context) {}
+
// inode represents a filesystem object.
type inode struct {
// fs is the owning filesystem. fs is immutable.
fs *filesystem
- // refs is a reference count. refs is accessed using atomic memory
- // operations.
- //
- // A reference is held on all inodes that are reachable in the filesystem
- // tree. For non-directories (which may have multiple hard links), this
- // means that a reference is dropped when nlink reaches 0. For directories,
- // nlink never reaches 0 due to the "." entry; instead,
- // filesystem.RmdirAt() drops the reference.
- refs int64
+ // A reference is held on all inodes as long as they are reachable in the
+ // filesystem tree, i.e. nlink is nonzero. This reference is dropped when
+ // nlink reaches 0.
+ refs inodeRefs
// xattrs implements extended attributes.
//
@@ -235,23 +309,24 @@ type inode struct {
ctime int64 // nanoseconds
mtime int64 // nanoseconds
- // Advisory file locks, which lock at the inode level.
- locks lock.FileLocks
+ locks vfs.FileLocks
+
+ // Inotify watches for this inode.
+ watches vfs.Watches
impl interface{} // immutable
}
const maxLinks = math.MaxUint32
-func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) {
+func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) {
if mode.FileType() == 0 {
panic("file type is required in FileMode")
}
i.fs = fs
- i.refs = 1
i.mode = uint32(mode)
- i.uid = uint32(creds.EffectiveKUID)
- i.gid = uint32(creds.EffectiveKGID)
+ i.uid = uint32(kuid)
+ i.gid = uint32(kgid)
i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
// Tmpfs creation sets atime, ctime, and mtime to current time.
now := fs.clock.Now().Nanoseconds()
@@ -260,6 +335,7 @@ func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials,
i.mtime = now
// i.nlink initialized by caller
i.impl = impl
+ i.refs.EnableLeakCheck()
}
// incLinksLocked increments i's link count.
@@ -276,45 +352,37 @@ func (i *inode) incLinksLocked() {
atomic.AddUint32(&i.nlink, 1)
}
-// decLinksLocked decrements i's link count.
+// decLinksLocked decrements i's link count. If the link count reaches 0, we
+// remove a reference on i as well.
//
// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
-func (i *inode) decLinksLocked() {
+func (i *inode) decLinksLocked(ctx context.Context) {
if i.nlink == 0 {
panic("tmpfs.inode.decLinksLocked() called with no existing links")
}
- atomic.AddUint32(&i.nlink, ^uint32(0))
+ if atomic.AddUint32(&i.nlink, ^uint32(0)) == 0 {
+ i.decRef(ctx)
+ }
}
func (i *inode) incRef() {
- if atomic.AddInt64(&i.refs, 1) <= 1 {
- panic("tmpfs.inode.incRef() called without holding a reference")
- }
+ i.refs.IncRef()
}
func (i *inode) tryIncRef() bool {
- for {
- refs := atomic.LoadInt64(&i.refs)
- if refs == 0 {
- return false
- }
- if atomic.CompareAndSwapInt64(&i.refs, refs, refs+1) {
- return true
- }
- }
+ return i.refs.TryIncRef()
}
-func (i *inode) decRef() {
- if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
+func (i *inode) decRef(ctx context.Context) {
+ i.refs.DecRef(func() {
+ i.watches.HandleDeletion(ctx)
if regFile, ok := i.impl.(*regularFile); ok {
// Release memory used by regFile to store data. Since regFile is
// no longer usable, we don't need to grab any locks or update any
// metadata.
regFile.data.DropAll(regFile.memFile)
}
- } else if refs < 0 {
- panic("tmpfs.inode.decRef() called without holding a reference")
- }
+ })
}
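
The hand-rolled atomics are replaced by a generated inodeRefs (the refs template), which centralizes the panic-on-misuse checks and adds leak checking. The contract the call sites above rely on, in sketch form:

    var r inodeRefs
    r.EnableLeakCheck() // complain if the inode is finalized with refs held
    r.IncRef()          // panics if the count is already zero
    _ = r.TryIncRef()   // returns false instead of panicking on zero
    r.DecRef(func() {
        // Destructor: runs exactly once, when the count reaches zero.
    })
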
func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
@@ -369,7 +437,8 @@ func (i *inode) statTo(stat *linux.Statx) {
}
}
-func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx) error {
+func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions) error {
+ stat := &opts.Stat
if stat.Mask == 0 {
return nil
}
@@ -377,7 +446,7 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu
return syserror.EPERM
}
mode := linux.FileMode(atomic.LoadUint32(&i.mode))
- if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
+ if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
return err
}
i.mu.Lock()
@@ -455,44 +524,6 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu
return nil
}
-// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
-func (i *inode) lockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
- switch i.impl.(type) {
- case *regularFile:
- return i.locks.LockBSD(uid, t, block)
- }
- return syserror.EBADF
-}
-
-// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
-func (i *inode) unlockBSD(uid fslock.UniqueID) error {
- switch i.impl.(type) {
- case *regularFile:
- i.locks.UnlockBSD(uid)
- return nil
- }
- return syserror.EBADF
-}
-
-// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
-func (i *inode) lockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error {
- switch i.impl.(type) {
- case *regularFile:
- return i.locks.LockPOSIX(uid, t, rng, block)
- }
- return syserror.EBADF
-}
-
-// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
-func (i *inode) unlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) error {
- switch i.impl.(type) {
- case *regularFile:
- i.locks.UnlockPOSIX(uid, rng)
- return nil
- }
- return syserror.EBADF
-}
-
// allocatedBlocksForSize returns the number of 512B blocks needed to
// accommodate the given size in bytes, as appropriate for struct
// stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
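
(The hunk cuts the comment and body off here, but the documented computation is the usual ceiling division; a sketch, not necessarily the exact body:)

    return (size + 511) / 512 // e.g. 1025 bytes -> 3 512-byte blocks
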
@@ -512,6 +543,8 @@ func (i *inode) direntType() uint8 {
return linux.DT_LNK
case *socketFile:
return linux.DT_SOCK
+ case *namedPipe:
+ return linux.DT_FIFO
case *deviceFile:
switch impl.kind {
case vfs.BlockDevice:
@@ -531,6 +564,9 @@ func (i *inode) isDir() bool {
}
func (i *inode) touchAtime(mnt *vfs.Mount) {
+ if mnt.Flags.NoATime {
+ return
+ }
if err := mnt.CheckBeginWrite(); err != nil {
return
}
@@ -621,14 +657,19 @@ func (i *inode) userXattrSupported() bool {
type fileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
+ vfs.LockFD
}
func (fd *fileDescription) filesystem() *filesystem {
return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
}
+func (fd *fileDescription) dentry() *dentry {
+ return fd.vfsfd.Dentry().Impl().(*dentry)
+}
+
func (fd *fileDescription) inode() *inode {
- return fd.vfsfd.Dentry().Impl().(*dentry).inode
+ return fd.dentry().inode
}
// Stat implements vfs.FileDescriptionImpl.Stat.
@@ -641,7 +682,15 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu
// SetStat implements vfs.FileDescriptionImpl.SetStat.
func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
creds := auth.CredentialsFromContext(ctx)
- return fd.inode().setStat(ctx, creds, &opts.Stat)
+ d := fd.dentry()
+ if err := d.inode.setStat(ctx, creds, &opts); err != nil {
+ return err
+ }
+
+ if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+ d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
+ }
+ return nil
}
// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
@@ -656,17 +705,31 @@ func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOption
// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
- return fd.inode().setxattr(auth.CredentialsFromContext(ctx), &opts)
+ d := fd.dentry()
+ if err := d.inode.setxattr(auth.CredentialsFromContext(ctx), &opts); err != nil {
+ return err
+ }
+
+ // Generate inotify events.
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ return nil
}
// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
- return fd.inode().removexattr(auth.CredentialsFromContext(ctx), name)
+ d := fd.dentry()
+ if err := d.inode.removexattr(auth.CredentialsFromContext(ctx), name); err != nil {
+ return err
+ }
+
+ // Generate inotify events.
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ return nil
}
// NewMemfd creates a new tmpfs regular file and file description that can back
// an anonymous fd created by memfd_create.
-func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name string) (*vfs.FileDescription, error) {
+func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) {
fs, ok := mount.Filesystem().Impl().(*filesystem)
if !ok {
panic("NewMemfd() called with non-tmpfs mount")
@@ -674,23 +737,39 @@ func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name s
// Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with
// S_IRWXUGO.
- mode := linux.FileMode(0777)
- inode := fs.newRegularFile(creds, mode)
+ inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777)
rf := inode.impl.(*regularFile)
if allowSeals {
rf.seals = 0
}
d := fs.newDentry(inode)
- defer d.DecRef()
+ defer d.DecRef(ctx)
d.name = name
// Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with
// FMODE_READ | FMODE_WRITE.
var fd regularFileFD
+ fd.Init(&inode.locks)
flags := uint32(linux.O_RDWR)
if err := fd.vfsfd.Init(&fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
return nil, err
}
return &fd.vfsfd, nil
}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync. It does nothing because all
+// filesystem state is in-memory.
+func (*fileDescription) Sync(context.Context) error {
+ return nil
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
index a240fb276..6f3e3ae6f 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
@@ -34,7 +34,7 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr
creds := auth.CredentialsFromContext(ctx)
vfsObj := &vfs.VirtualFilesystem{}
- if err := vfsObj.Init(); err != nil {
+ if err := vfsObj.Init(ctx); err != nil {
return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err)
}
@@ -47,8 +47,8 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr
}
root := mntns.Root()
return vfsObj, root, func() {
- root.DecRef()
- mntns.DecRef()
+ root.DecRef(ctx)
+ mntns.DecRef(ctx)
}, nil
}
diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go
index 2916a0644..c0b4831d1 100644
--- a/pkg/sentry/inet/inet.go
+++ b/pkg/sentry/inet/inet.go
@@ -56,6 +56,12 @@ type Stack interface {
// settings.
SetTCPSACKEnabled(enabled bool) error
+ // TCPRecovery returns the TCP loss detection algorithm.
+ TCPRecovery() (TCPLossRecovery, error)
+
+ // SetTCPRecovery attempts to change TCP loss detection algorithm.
+ SetTCPRecovery(recovery TCPLossRecovery) error
+
// Statistics reports stack statistics.
Statistics(stat interface{}, arg string) error
@@ -189,3 +195,14 @@ type StatSNMPUDP [8]uint64
// StatSNMPUDPLite describes UdpLite line of /proc/net/snmp.
type StatSNMPUDPLite [8]uint64
+
+// TCPLossRecovery indicates the TCP loss detection and recovery methods to use.
+type TCPLossRecovery int32
+
+// Loss recovery constants from include/net/tcp.h, which are used to set
+// /proc/sys/net/ipv4/tcp_recovery.
+const (
+ TCP_RACK_LOSS_DETECTION TCPLossRecovery = 1 << iota
+ TCP_RACK_STATIC_REO_WND
+ TCP_RACK_NO_DUPTHRESH
+)
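
Since these are bit flags (1 << iota), values combine with OR, exactly as when writing /proc/sys/net/ipv4/tcp_recovery on Linux. A sketch, assuming s implements inet.Stack:

    // Enable RACK loss detection with a static reordering window.
    r := inet.TCP_RACK_LOSS_DETECTION | inet.TCP_RACK_STATIC_REO_WND
    if err := s.SetTCPRecovery(r); err != nil {
        return err
    }
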
diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go
index d8961fc94..9771f01fc 100644
--- a/pkg/sentry/inet/test_stack.go
+++ b/pkg/sentry/inet/test_stack.go
@@ -25,6 +25,7 @@ type TestStack struct {
TCPRecvBufSize TCPBufferSize
TCPSendBufSize TCPBufferSize
TCPSACKFlag bool
+ Recovery TCPLossRecovery
}
// NewTestStack returns a TestStack with no network interfaces. The value of
@@ -91,6 +92,17 @@ func (s *TestStack) SetTCPSACKEnabled(enabled bool) error {
return nil
}
+// TCPRecovery implements Stack.TCPRecovery.
+func (s *TestStack) TCPRecovery() (TCPLossRecovery, error) {
+ return s.Recovery, nil
+}
+
+// SetTCPRecovery implements Stack.SetTCPRecovery.
+func (s *TestStack) SetTCPRecovery(recovery TCPLossRecovery) error {
+ s.Recovery = recovery
+ return nil
+}
+
// Statistics implements inet.Stack.Statistics.
func (s *TestStack) Statistics(stat interface{}, arg string) error {
return nil
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index a28eab8b8..f6886a758 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -85,6 +85,7 @@ go_library(
name = "kernel",
srcs = [
"abstract_socket_namespace.go",
+ "aio.go",
"context.go",
"fd_table.go",
"fd_table_unsafe.go",
@@ -131,6 +132,7 @@ go_library(
"task_stop.go",
"task_syscall.go",
"task_usermem.go",
+ "task_work.go",
"thread_group.go",
"threads.go",
"timekeeper.go",
@@ -199,6 +201,7 @@ go_library(
"//pkg/sentry/vfs",
"//pkg/state",
"//pkg/state/statefile",
+ "//pkg/state/wire",
"//pkg/sync",
"//pkg/syserr",
"//pkg/syserror",
diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go
index 920fe4329..52ed5cea2 100644
--- a/pkg/sentry/kernel/abstract_socket_namespace.go
+++ b/pkg/sentry/kernel/abstract_socket_namespace.go
@@ -17,6 +17,7 @@ package kernel
import (
"syscall"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sync"
@@ -31,7 +32,7 @@ type abstractEndpoint struct {
}
// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
-func (e *abstractEndpoint) WeakRefGone() {
+func (e *abstractEndpoint) WeakRefGone(context.Context) {
e.ns.mu.Lock()
if e.ns.endpoints[e.name].ep == e.ep {
delete(e.ns.endpoints, e.name)
@@ -64,9 +65,9 @@ type boundEndpoint struct {
}
// Release implements transport.BoundEndpoint.Release.
-func (e *boundEndpoint) Release() {
- e.rc.DecRef()
- e.BoundEndpoint.Release()
+func (e *boundEndpoint) Release(ctx context.Context) {
+ e.rc.DecRef(ctx)
+ e.BoundEndpoint.Release(ctx)
}
// BoundEndpoint retrieves the endpoint bound to the given name. The return
@@ -93,13 +94,13 @@ func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndp
//
// When the last reference managed by rc is dropped, ep may be removed from the
// namespace.
-func (a *AbstractSocketNamespace) Bind(name string, ep transport.BoundEndpoint, rc refs.RefCounter) error {
+func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, rc refs.RefCounter) error {
a.mu.Lock()
defer a.mu.Unlock()
if ep, ok := a.endpoints[name]; ok {
if rc := ep.wr.Get(); rc != nil {
- rc.DecRef()
+ rc.DecRef(ctx)
return syscall.EADDRINUSE
}
}
diff --git a/pkg/sentry/kernel/aio.go b/pkg/sentry/kernel/aio.go
new file mode 100644
index 000000000..0ac78c0b8
--- /dev/null
+++ b/pkg/sentry/kernel/aio.go
@@ -0,0 +1,81 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "time"
+
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/log"
+)
+
+// AIOCallback is a function that does asynchronous I/O on behalf of a task.
+type AIOCallback func(context.Context)
+
+// QueueAIO queues an AIOCallback which will be run asynchronously.
+func (t *Task) QueueAIO(cb AIOCallback) {
+ ctx := taskAsyncContext{t: t}
+ wg := &t.TaskSet().aioGoroutines
+ wg.Add(1)
+ go func() {
+ cb(ctx)
+ wg.Done()
+ }()
+}
+
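
QueueAIO replaces Task.AsyncContext (removed from context.go below): callers hand the kernel a callback rather than fetching a context, and each AIO goroutine is tracked in TaskSet.aioGoroutines, whose WaitGroup lets the kernel wait for outstanding I/O to drain. A hypothetical caller:

    t.QueueAIO(func(ctx context.Context) {
        // Runs off the task goroutine, but logging and context values
        // still resolve through t via taskAsyncContext.
        _ = doRequest(ctx) // doRequest is a hypothetical I/O helper
    })
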
+type taskAsyncContext struct {
+ context.NoopSleeper
+ t *Task
+}
+
+// Debugf implements log.Logger.Debugf.
+func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) {
+ ctx.t.Debugf(format, v...)
+}
+
+// Infof implements log.Logger.Infof.
+func (ctx taskAsyncContext) Infof(format string, v ...interface{}) {
+ ctx.t.Infof(format, v...)
+}
+
+// Warningf implements log.Logger.Warningf.
+func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) {
+ ctx.t.Warningf(format, v...)
+}
+
+// IsLogging implements log.Logger.IsLogging.
+func (ctx taskAsyncContext) IsLogging(level log.Level) bool {
+ return ctx.t.IsLogging(level)
+}
+
+// Deadline implements context.Context.Deadline.
+func (ctx taskAsyncContext) Deadline() (time.Time, bool) {
+ return ctx.t.Deadline()
+}
+
+// Done implements context.Context.Done.
+func (ctx taskAsyncContext) Done() <-chan struct{} {
+ return ctx.t.Done()
+}
+
+// Err implements context.Context.Err.
+func (ctx taskAsyncContext) Err() error {
+ return ctx.t.Err()
+}
+
+// Value implements context.Context.Value.
+func (ctx taskAsyncContext) Value(key interface{}) interface{} {
+ return ctx.t.Value(key)
+}
diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go
index e057d2c6d..6862f2ef5 100644
--- a/pkg/sentry/kernel/auth/credentials.go
+++ b/pkg/sentry/kernel/auth/credentials.go
@@ -232,3 +232,31 @@ func (c *Credentials) UseGID(gid GID) (KGID, error) {
}
return NoID, syserror.EPERM
}
+
+// SetUID translates the provided uid to the root user namespace and updates c's
+// uids to it. This performs no permissions or capabilities checks; the caller
+// is responsible for ensuring the calling context is permitted to modify c.
+func (c *Credentials) SetUID(uid UID) error {
+ kuid := c.UserNamespace.MapToKUID(uid)
+ if !kuid.Ok() {
+ return syserror.EINVAL
+ }
+ c.RealKUID = kuid
+ c.EffectiveKUID = kuid
+ c.SavedKUID = kuid
+ return nil
+}
+
+// SetGID translates the provided gid to the root user namespace and updates c's
+// gids to it. This performs no permissions or capabilities checks; the caller
+// is responsible for ensuring the calling context is permitted to modify c.
+func (c *Credentials) SetGID(gid GID) error {
+ kgid := c.UserNamespace.MapToKGID(gid)
+ if !kgid.Ok() {
+ return syserror.EINVAL
+ }
+ c.RealKGID = kgid
+ c.EffectiveKGID = kgid
+ c.SavedKGID = kgid
+ return nil
+}
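
Both helpers set the real, effective, and saved IDs at once and, as documented, perform no permission checks, so they are meant for trusted configuration paths. A minimal sketch, assuming c's UserNamespace maps the IDs:

    if err := c.SetUID(auth.UID(1000)); err != nil {
        return err // EINVAL: the uid has no mapping in c.UserNamespace
    }
    if err := c.SetGID(auth.GID(1000)); err != nil {
        return err
    }
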
diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go
index 0c40bf315..dd5f0f5fa 100644
--- a/pkg/sentry/kernel/context.go
+++ b/pkg/sentry/kernel/context.go
@@ -18,7 +18,6 @@ import (
"time"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/log"
)
// contextID is the kernel package's type for context.Context.Value keys.
@@ -113,55 +112,3 @@ func (*Task) Done() <-chan struct{} {
func (*Task) Err() error {
return nil
}
-
-// AsyncContext returns a context.Context that may be used by goroutines that
-// do work on behalf of t and therefore share its contextual values, but are
-// not t's task goroutine (e.g. asynchronous I/O).
-func (t *Task) AsyncContext() context.Context {
- return taskAsyncContext{t: t}
-}
-
-type taskAsyncContext struct {
- context.NoopSleeper
- t *Task
-}
-
-// Debugf implements log.Logger.Debugf.
-func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) {
- ctx.t.Debugf(format, v...)
-}
-
-// Infof implements log.Logger.Infof.
-func (ctx taskAsyncContext) Infof(format string, v ...interface{}) {
- ctx.t.Infof(format, v...)
-}
-
-// Warningf implements log.Logger.Warningf.
-func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) {
- ctx.t.Warningf(format, v...)
-}
-
-// IsLogging implements log.Logger.IsLogging.
-func (ctx taskAsyncContext) IsLogging(level log.Level) bool {
- return ctx.t.IsLogging(level)
-}
-
-// Deadline implements context.Context.Deadline.
-func (ctx taskAsyncContext) Deadline() (time.Time, bool) {
- return ctx.t.Deadline()
-}
-
-// Done implements context.Context.Done.
-func (ctx taskAsyncContext) Done() <-chan struct{} {
- return ctx.t.Done()
-}
-
-// Err implements context.Context.Err.
-func (ctx taskAsyncContext) Err() error {
- return ctx.t.Err()
-}
-
-// Value implements context.Context.Value.
-func (ctx taskAsyncContext) Value(key interface{}) interface{} {
- return ctx.t.Value(key)
-}
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go
index 3d78cd48f..15519f0df 100644
--- a/pkg/sentry/kernel/epoll/epoll.go
+++ b/pkg/sentry/kernel/epoll/epoll.go
@@ -76,8 +76,8 @@ type pollEntry struct {
// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
// weakReferenceGone is called when the file in the weak reference is destroyed.
// The poll entry is removed in response to this.
-func (p *pollEntry) WeakRefGone() {
- p.epoll.RemoveEntry(p.id)
+func (p *pollEntry) WeakRefGone(ctx context.Context) {
+ p.epoll.RemoveEntry(ctx, p.id)
}
// EventPoll holds all the state associated with an event poll object, that is,
@@ -107,7 +107,7 @@ type EventPoll struct {
// different lock to avoid circular lock acquisition order involving
// the wait queue mutexes and mu. The full order is mu, observed file
// wait queue mutex, then listsMu; this allows listsMu to be acquired
- // when readyCallback is called.
+ // when (*pollEntry).Callback is called.
//
// An entry is always in one of the following lists:
// readyList -- when there's a chance that it's ready to have
@@ -116,7 +116,7 @@ type EventPoll struct {
// readEvents() functions always call the entry's file
// Readiness() function to confirm it's ready.
// waitingList -- when there's no chance that the entry is ready,
- // so it's waiting for the readyCallback to be called
+ // so it's waiting for the (*pollEntry).Callback to be called
// on it before it gets moved to the readyList.
// disabledList -- when the entry is disabled. This happens when
// a one-shot entry gets delivered via readEvents().
@@ -144,14 +144,14 @@ func NewEventPoll(ctx context.Context) *fs.File {
// name matches fs/eventpoll.c:epoll_create1.
dirent := fs.NewDirent(ctx, anon.NewInode(ctx), fmt.Sprintf("anon_inode:[eventpoll]"))
// Release the initial dirent reference after NewFile takes a reference.
- defer dirent.DecRef()
+ defer dirent.DecRef(ctx)
return fs.NewFile(ctx, dirent, fs.FileFlags{}, &EventPoll{
files: make(map[FileIdentifier]*pollEntry),
})
}
// Release implements fs.FileOperations.Release.
-func (e *EventPoll) Release() {
+func (e *EventPoll) Release(ctx context.Context) {
// We need to take the lock now because files may be attempting to
// remove entries in parallel if they get destroyed.
e.mu.Lock()
@@ -160,7 +160,7 @@ func (e *EventPoll) Release() {
// Go through all entries and clean up.
for _, entry := range e.files {
entry.id.File.EventUnregister(&entry.waiter)
- entry.file.Drop()
+ entry.file.Drop(ctx)
}
e.files = nil
}
@@ -269,21 +269,19 @@ func (e *EventPoll) ReadEvents(max int) []linux.EpollEvent {
return ret
}
-// readyCallback is called when one of the files we're polling becomes ready. It
-// moves said file to the readyList if it's currently in the waiting list.
-type readyCallback struct{}
-
// Callback implements waiter.EntryCallback.Callback.
-func (*readyCallback) Callback(w *waiter.Entry) {
- entry := w.Context.(*pollEntry)
- e := entry.epoll
+//
+// Callback is called when one of the files we're polling becomes ready. It
+// moves said file to the readyList if it's currently in the waiting list.
+func (p *pollEntry) Callback(*waiter.Entry) {
+ e := p.epoll
e.listsMu.Lock()
- if entry.curList == &e.waitingList {
- e.waitingList.Remove(entry)
- e.readyList.PushBack(entry)
- entry.curList = &e.readyList
+ if p.curList == &e.waitingList {
+ e.waitingList.Remove(p)
+ e.readyList.PushBack(p)
+ p.curList = &e.readyList
e.listsMu.Unlock()
e.Notify(waiter.EventIn)
@@ -310,7 +308,7 @@ func (e *EventPoll) initEntryReadiness(entry *pollEntry) {
// Check if the file happens to already be in a ready state.
ready := f.Readiness(entry.mask) & entry.mask
if ready != 0 {
- (*readyCallback).Callback(nil, &entry.waiter)
+ entry.Callback(&entry.waiter)
}
}
@@ -380,10 +378,9 @@ func (e *EventPoll) AddEntry(id FileIdentifier, flags EntryFlags, mask waiter.Ev
userData: data,
epoll: e,
flags: flags,
- waiter: waiter.Entry{Callback: &readyCallback{}},
mask: mask,
}
- entry.waiter.Context = entry
+ entry.waiter.Callback = entry
e.files[id] = entry
entry.file = refs.NewWeakRef(id.File, entry)
@@ -406,7 +403,7 @@ func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter
}
// Unregister the old mask and remove entry from the list it's in, so
- // readyCallback is guaranteed to not be called on this entry anymore.
+ // (*pollEntry).Callback is guaranteed not to be called on this entry anymore.
entry.id.File.EventUnregister(&entry.waiter)
// Remove entry from whatever list it's in. This ensures that no other
@@ -426,7 +423,7 @@ func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter
}
// RemoveEntry removes a file from the collection of observed files.
-func (e *EventPoll) RemoveEntry(id FileIdentifier) error {
+func (e *EventPoll) RemoveEntry(ctx context.Context, id FileIdentifier) error {
e.mu.Lock()
defer e.mu.Unlock()
@@ -448,7 +445,7 @@ func (e *EventPoll) RemoveEntry(id FileIdentifier) error {
// Remove file from map, and drop weak reference.
delete(e.files, id)
- entry.file.Drop()
+ entry.file.Drop(ctx)
return nil
}
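
The epoll refactor folds the stateless readyCallback{} into pollEntry: the entry now implements waiter.EntryCallback itself, removing the indirection through waiter.Entry.Context. In sketch form:

    // Before: a shared callback recovered the entry from Entry.Context.
    entry.waiter = waiter.Entry{Callback: &readyCallback{}}
    entry.waiter.Context = entry

    // After: the entry is its own callback.
    entry.waiter.Callback = entry
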
diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go
index 8e9f200d0..7c61e0258 100644
--- a/pkg/sentry/kernel/epoll/epoll_state.go
+++ b/pkg/sentry/kernel/epoll/epoll_state.go
@@ -21,8 +21,7 @@ import (
// afterLoad is invoked by stateify.
func (p *pollEntry) afterLoad() {
- p.waiter = waiter.Entry{Callback: &readyCallback{}}
- p.waiter.Context = p
+ p.waiter.Callback = p
p.file = refs.NewWeakRef(p.id.File, p)
p.id.File.EventRegister(&p.waiter, p.mask)
}
diff --git a/pkg/sentry/kernel/epoll/epoll_test.go b/pkg/sentry/kernel/epoll/epoll_test.go
index 22630e9c5..55b505593 100644
--- a/pkg/sentry/kernel/epoll/epoll_test.go
+++ b/pkg/sentry/kernel/epoll/epoll_test.go
@@ -26,7 +26,8 @@ func TestFileDestroyed(t *testing.T) {
f := filetest.NewTestFile(t)
id := FileIdentifier{f, 12}
- efile := NewEventPoll(contexttest.Context(t))
+ ctx := contexttest.Context(t)
+ efile := NewEventPoll(ctx)
e := efile.FileOperations.(*EventPoll)
if err := e.AddEntry(id, 0, waiter.EventIn, [2]int32{}); err != nil {
t.Fatalf("addEntry failed: %v", err)
@@ -44,7 +45,7 @@ func TestFileDestroyed(t *testing.T) {
}
// Destroy the file. Check that we get no more events.
- f.DecRef()
+ f.DecRef(ctx)
evt = e.ReadEvents(1)
if len(evt) != 0 {
diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go
index 87951adeb..bbf568dfc 100644
--- a/pkg/sentry/kernel/eventfd/eventfd.go
+++ b/pkg/sentry/kernel/eventfd/eventfd.go
@@ -70,7 +70,7 @@ func New(ctx context.Context, initVal uint64, semMode bool) *fs.File {
// name matches fs/eventfd.c:eventfd_file_create.
dirent := fs.NewDirent(ctx, anon.NewInode(ctx), "anon_inode:[eventfd]")
// Release the initial dirent reference after NewFile takes a reference.
- defer dirent.DecRef()
+ defer dirent.DecRef(ctx)
return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &EventOperations{
val: initVal,
semMode: semMode,
@@ -106,7 +106,7 @@ func (e *EventOperations) HostFD() (int, error) {
}
// Release implements fs.FileOperations.Release.
-func (e *EventOperations) Release() {
+func (e *EventOperations) Release(context.Context) {
e.mu.Lock()
defer e.mu.Unlock()
if e.hostfd >= 0 {
diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD
index b9126e946..2b3955598 100644
--- a/pkg/sentry/kernel/fasync/BUILD
+++ b/pkg/sentry/kernel/fasync/BUILD
@@ -11,6 +11,7 @@ go_library(
"//pkg/sentry/fs",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
+ "//pkg/sentry/vfs",
"//pkg/sync",
"//pkg/waiter",
],
diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go
index d32c3e90a..153d2cd9b 100644
--- a/pkg/sentry/kernel/fasync/fasync.go
+++ b/pkg/sentry/kernel/fasync/fasync.go
@@ -20,15 +20,21 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/waiter"
)
-// New creates a new FileAsync.
+// New creates a new fs.FileAsync.
func New() fs.FileAsync {
return &FileAsync{}
}
+// NewVFS2 creates a new vfs.FileAsync.
+func NewVFS2() vfs.FileAsync {
+ return &FileAsync{}
+}
+
// FileAsync sends signals when the registered file is ready for IO.
//
// +stateify savable
@@ -170,3 +176,13 @@ func (a *FileAsync) SetOwnerProcessGroup(requester *kernel.Task, recipient *kern
a.recipientTG = nil
a.recipientPG = recipient
}
+
+// ClearOwner unsets the current signal recipient.
+func (a *FileAsync) ClearOwner() {
+ a.mu.Lock()
+ defer a.mu.Unlock()
+ a.requester = nil
+ a.recipientT = nil
+ a.recipientTG = nil
+ a.recipientPG = nil
+}
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index ed40b5303..ce53af69b 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -29,6 +29,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
)
// FDFlags define flags for an individual descriptor.
@@ -80,9 +81,6 @@ type FDTable struct {
refs.AtomicRefCount
k *Kernel
- // uid is a unique identifier.
- uid uint64
-
// mu protects below.
mu sync.Mutex `state:"nosave"`
@@ -100,7 +98,7 @@ type FDTable struct {
func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
m := make(map[int32]descriptor)
- f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+ f.forEach(context.Background(), func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
m[fd] = descriptor{
file: file,
fileVFS2: fileVFS2,
@@ -111,6 +109,7 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
}
func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
+ ctx := context.Background()
f.init() // Initialize table.
for fd, d := range m {
f.setAll(fd, d.file, d.fileVFS2, d.flags)
@@ -120,9 +119,9 @@ func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
// reference taken by set above.
switch {
case d.file != nil:
- d.file.DecRef()
+ d.file.DecRef(ctx)
case d.fileVFS2 != nil:
- d.fileVFS2.DecRef()
+ d.fileVFS2.DecRef(ctx)
}
}
}
@@ -130,7 +129,7 @@ func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
// drop drops the table reference.
func (f *FDTable) drop(file *fs.File) {
// Release locks.
- file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lock.UniqueID(f.uid), lock.LockRange{0, lock.LockEOF})
+ file.Dirent.Inode.LockCtx.Posix.UnlockRegion(f, lock.LockRange{0, lock.LockEOF})
// Send inotify events.
d := file.Dirent
@@ -146,43 +145,47 @@ func (f *FDTable) drop(file *fs.File) {
d.InotifyEvent(ev, 0)
// Drop the table reference.
- file.DecRef()
+ file.DecRef(context.Background())
}
// dropVFS2 drops the table reference.
func (f *FDTable) dropVFS2(file *vfs.FileDescription) {
- // TODO(gvisor.dev/issue/1480): Release locks.
- // TODO(gvisor.dev/issue/1479): Send inotify events.
+ // Release any POSIX lock possibly held by the FDTable. Range {0, 0} means the
+ // entire file.
+ ctx := context.Background()
+ err := file.UnlockPOSIX(ctx, f, 0, 0, linux.SEEK_SET)
+ if err != nil && err != syserror.ENOLCK {
+ panic(fmt.Sprintf("UnlockPOSIX failed: %v", err))
+ }
- // Drop the table reference.
- file.DecRef()
-}
+ // Generate inotify events.
+ ev := uint32(linux.IN_CLOSE_NOWRITE)
+ if file.IsWritable() {
+ ev = linux.IN_CLOSE_WRITE
+ }
+ file.Dentry().InotifyWithParent(ctx, ev, 0, vfs.PathEvent)
-// ID returns a unique identifier for this FDTable.
-func (f *FDTable) ID() uint64 {
- return f.uid
+ // Drop the table's reference.
+ file.DecRef(ctx)
}
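
Note the lock owner: the FDTable f itself now serves as the fslock.UniqueID in both the VFS1 and VFS2 drop paths, so every POSIX lock taken through the table shares one owner and is released whenever a descriptor for that file leaves the table, matching close(2)'s (notoriously surprising) POSIX lock semantics. Illustratively:

    // Locks taken via this table record f as the owner...
    fd.LockPOSIX(ctx, f, fslock.WriteLock, 0, 0, linux.SEEK_SET, nil)
    // ...so dropping the file releases everything f owns; {0, 0}
    // spans the entire file, as noted above.
    fd.UnlockPOSIX(ctx, f, 0, 0, linux.SEEK_SET)
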
// NewFDTable allocates a new FDTable that may be used by tasks in k.
func (k *Kernel) NewFDTable() *FDTable {
- f := &FDTable{
- k: k,
- uid: atomic.AddUint64(&k.fdMapUids, 1),
- }
+ f := &FDTable{k: k}
f.init()
return f
}
// destroy removes all of the file descriptors from the map.
-func (f *FDTable) destroy() {
- f.RemoveIf(func(*fs.File, *vfs.FileDescription, FDFlags) bool {
+func (f *FDTable) destroy(ctx context.Context) {
+ f.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, FDFlags) bool {
return true
})
}
// DecRef implements RefCounter.DecRef with destructor f.destroy.
-func (f *FDTable) DecRef() {
- f.DecRefWithDestructor(f.destroy)
+func (f *FDTable) DecRef(ctx context.Context) {
+ f.DecRefWithDestructor(ctx, f.destroy)
}
// Size returns the number of file descriptor slots currently allocated.
@@ -194,7 +197,7 @@ func (f *FDTable) Size() int {
// forEach iterates over all non-nil files in sorted order.
//
// It is the caller's responsibility to acquire an appropriate lock.
-func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) {
+func (f *FDTable) forEach(ctx context.Context, fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) {
// retries tracks the number of failed TryIncRef attempts for the same FD.
retries := 0
fd := int32(0)
@@ -213,7 +216,7 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes
continue // Race caught.
}
fn(fd, file, nil, flags)
- file.DecRef()
+ file.DecRef(ctx)
case fileVFS2 != nil:
if !fileVFS2.TryIncRef() {
retries++
@@ -223,7 +226,7 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes
continue // Race caught.
}
fn(fd, nil, fileVFS2, flags)
- fileVFS2.DecRef()
+ fileVFS2.DecRef(ctx)
}
retries = 0
fd++
@@ -233,7 +236,8 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes
// String is a stringer for FDTable.
func (f *FDTable) String() string {
var buf strings.Builder
- f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+ ctx := context.Background()
+ f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
switch {
case file != nil:
n, _ := file.Dirent.FullName(nil /* root */)
@@ -241,7 +245,7 @@ func (f *FDTable) String() string {
case fileVFS2 != nil:
vfsObj := fileVFS2.Mount().Filesystem().VirtualFilesystem()
- name, err := vfsObj.PathnameWithDeleted(context.Background(), vfs.VirtualDentry{}, fileVFS2.VirtualDentry())
+ name, err := vfsObj.PathnameWithDeleted(ctx, vfs.VirtualDentry{}, fileVFS2.VirtualDentry())
if err != nil {
fmt.Fprintf(&buf, "<err: %v>\n", err)
return
@@ -463,6 +467,29 @@ func (f *FDTable) SetFlags(fd int32, flags FDFlags) error {
return nil
}
+// SetFlagsVFS2 sets the flags for the given file descriptor.
+//
+// An error is returned if the fd is invalid or no file is defined for it.
+func (f *FDTable) SetFlagsVFS2(fd int32, flags FDFlags) error {
+ if fd < 0 {
+ // Don't accept negative FDs.
+ return syscall.EBADF
+ }
+
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ file, _, _ := f.getVFS2(fd)
+ if file == nil {
+ // No file found.
+ return syscall.EBADF
+ }
+
+ // Update the flags.
+ f.setVFS2(fd, file, flags)
+ return nil
+}
+
// Get returns a reference to the file and the flags for the FD or nil if no
// file is defined for the given fd.
//
@@ -517,9 +544,9 @@ func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) {
//
// Precondition: The caller must be running on the task goroutine, or Task.mu
// must be locked.
-func (f *FDTable) GetFDs() []int32 {
+func (f *FDTable) GetFDs(ctx context.Context) []int32 {
fds := make([]int32, 0, int(atomic.LoadInt32(&f.used)))
- f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+ f.forEach(ctx, func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) {
fds = append(fds, fd)
})
return fds
@@ -528,9 +555,9 @@ func (f *FDTable) GetFDs() []int32 {
// GetRefs returns a stable slice of references to all files and bumps the
// reference count on each. The caller must use DecRef on each reference when
// they're done using the slice.
-func (f *FDTable) GetRefs() []*fs.File {
+func (f *FDTable) GetRefs(ctx context.Context) []*fs.File {
files := make([]*fs.File, 0, f.Size())
- f.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+ f.forEach(ctx, func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
file.IncRef() // Acquire a reference for caller.
files = append(files, file)
})
@@ -540,9 +567,9 @@ func (f *FDTable) GetRefs() []*fs.File {
// GetRefsVFS2 returns a stable slice of references to all files and bumps the
// reference count on each. The caller must use DecRef on each reference when
// they're done using the slice.
-func (f *FDTable) GetRefsVFS2() []*vfs.FileDescription {
+func (f *FDTable) GetRefsVFS2(ctx context.Context) []*vfs.FileDescription {
files := make([]*vfs.FileDescription, 0, f.Size())
- f.forEach(func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) {
+ f.forEach(ctx, func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) {
file.IncRef() // Acquire a reference for caller.
files = append(files, file)
})
@@ -550,10 +577,10 @@ func (f *FDTable) GetRefsVFS2() []*vfs.FileDescription {
}
// Fork returns an independent FDTable.
-func (f *FDTable) Fork() *FDTable {
+func (f *FDTable) Fork(ctx context.Context) *FDTable {
clone := f.k.NewFDTable()
- f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+ f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
// The set function here will acquire an appropriate table
// reference for the clone. We don't need anything else.
switch {
@@ -598,11 +625,11 @@ func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) {
}
// RemoveIf removes all FDs where cond is true.
-func (f *FDTable) RemoveIf(cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) {
+func (f *FDTable) RemoveIf(ctx context.Context, cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) {
f.mu.Lock()
defer f.mu.Unlock()
- f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+ f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
if cond(file, fileVFS2, flags) {
f.set(fd, nil, FDFlags{}) // Clear from table.
// Update current available position.
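With release now context-threaded, every caller that drops table or file
references must supply a context. A minimal sketch, assuming a helper name
(releaseTable) that is not part of this diff; a kernel.Task satisfies
context.Context, so tasks can be passed directly:

	// releaseTable drops every descriptor, then the table reference itself.
	func releaseTable(ctx context.Context, table *kernel.FDTable) {
		table.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, kernel.FDFlags) bool {
			return true // unconditionally clear, as destroy() does above
		})
		table.DecRef(ctx)
	}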
diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go
index 29f95a2c4..e3f30ba2a 100644
--- a/pkg/sentry/kernel/fd_table_test.go
+++ b/pkg/sentry/kernel/fd_table_test.go
@@ -154,7 +154,7 @@ func TestFDTable(t *testing.T) {
if ref == nil {
t.Fatalf("fdTable.Remove(1) for an existing FD: failed, want success")
}
- ref.DecRef()
+ ref.DecRef(ctx)
if ref, _ := fdTable.Remove(1); ref != nil {
t.Fatalf("r.Remove(1) for a removed FD: got success, want failure")
@@ -191,7 +191,7 @@ func BenchmarkFDLookupAndDecRef(b *testing.B) {
b.StartTimer() // Benchmark.
for i := 0; i < b.N; i++ {
tf, _ := fdTable.Get(fds[i%len(fds)])
- tf.DecRef()
+ tf.DecRef(ctx)
}
})
}
@@ -219,7 +219,7 @@ func BenchmarkFDLookupAndDecRefConcurrent(b *testing.B) {
defer wg.Done()
for i := 0; i < each; i++ {
tf, _ := fdTable.Get(fds[i%len(fds)])
- tf.DecRef()
+ tf.DecRef(ctx)
}
}()
}
diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go
index 47f78df9a..8f2d36d5a 100644
--- a/pkg/sentry/kernel/fs_context.go
+++ b/pkg/sentry/kernel/fs_context.go
@@ -17,6 +17,7 @@ package kernel
import (
"fmt"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -89,28 +90,28 @@ func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext {
// Note that there may still be calls to WorkingDirectory() or RootDirectory()
// (that return nil). This is because valid references may still be held via
// proc files or other mechanisms.
-func (f *FSContext) destroy() {
+func (f *FSContext) destroy(ctx context.Context) {
// Hold f.mu so that we don't race with RootDirectory() and
// WorkingDirectory().
f.mu.Lock()
defer f.mu.Unlock()
if VFS2Enabled {
- f.rootVFS2.DecRef()
+ f.rootVFS2.DecRef(ctx)
f.rootVFS2 = vfs.VirtualDentry{}
- f.cwdVFS2.DecRef()
+ f.cwdVFS2.DecRef(ctx)
f.cwdVFS2 = vfs.VirtualDentry{}
} else {
- f.root.DecRef()
+ f.root.DecRef(ctx)
f.root = nil
- f.cwd.DecRef()
+ f.cwd.DecRef(ctx)
f.cwd = nil
}
}
// DecRef implements RefCounter.DecRef with destructor f.destroy.
-func (f *FSContext) DecRef() {
- f.DecRefWithDestructor(f.destroy)
+func (f *FSContext) DecRef(ctx context.Context) {
+ f.DecRefWithDestructor(ctx, f.destroy)
}
// Fork forks this FSContext.
@@ -165,7 +166,7 @@ func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry {
// This will take an extra reference on the Dirent.
//
// This is not a valid call after destroy.
-func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) {
+func (f *FSContext) SetWorkingDirectory(ctx context.Context, d *fs.Dirent) {
if d == nil {
panic("FSContext.SetWorkingDirectory called with nil dirent")
}
@@ -180,21 +181,21 @@ func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) {
old := f.cwd
f.cwd = d
d.IncRef()
- old.DecRef()
+ old.DecRef(ctx)
}
// SetWorkingDirectoryVFS2 sets the current working directory.
// This will take an extra reference on the VirtualDentry.
//
// This is not a valid call after destroy.
-func (f *FSContext) SetWorkingDirectoryVFS2(d vfs.VirtualDentry) {
+func (f *FSContext) SetWorkingDirectoryVFS2(ctx context.Context, d vfs.VirtualDentry) {
f.mu.Lock()
defer f.mu.Unlock()
old := f.cwdVFS2
f.cwdVFS2 = d
d.IncRef()
- old.DecRef()
+ old.DecRef(ctx)
}
// RootDirectory returns the current filesystem root.
@@ -226,7 +227,7 @@ func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry {
// This will take an extra reference on the Dirent.
//
// This is not a valid call after free.
-func (f *FSContext) SetRootDirectory(d *fs.Dirent) {
+func (f *FSContext) SetRootDirectory(ctx context.Context, d *fs.Dirent) {
if d == nil {
panic("FSContext.SetRootDirectory called with nil dirent")
}
@@ -241,13 +242,13 @@ func (f *FSContext) SetRootDirectory(d *fs.Dirent) {
old := f.root
f.root = d
d.IncRef()
- old.DecRef()
+ old.DecRef(ctx)
}
// SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd.
//
// This is not a valid call after free.
-func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) {
+func (f *FSContext) SetRootDirectoryVFS2(ctx context.Context, vd vfs.VirtualDentry) {
if !vd.Ok() {
panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry")
}
@@ -263,7 +264,7 @@ func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) {
vd.IncRef()
f.rootVFS2 = vd
f.mu.Unlock()
- old.DecRef()
+ old.DecRef(ctx)
}
// Umask returns the current umask.
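For illustration, a chdir-style update under the new signatures (the helper
itself is hypothetical; only SetWorkingDirectory's signature comes from this
diff):

	// chdir installs d as t's working directory; the old dirent is
	// released with t itself serving as the context.
	func chdir(t *kernel.Task, d *fs.Dirent) {
		t.FSContext().SetWorkingDirectory(t, d)
	}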
diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD
index c5021f2db..daa2dae76 100644
--- a/pkg/sentry/kernel/futex/BUILD
+++ b/pkg/sentry/kernel/futex/BUILD
@@ -51,6 +51,7 @@ go_test(
srcs = ["futex_test.go"],
library = ":futex",
deps = [
+ "//pkg/context",
"//pkg/sync",
"//pkg/usermem",
],
diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go
index 732e66da4..e4dcc4d40 100644
--- a/pkg/sentry/kernel/futex/futex.go
+++ b/pkg/sentry/kernel/futex/futex.go
@@ -19,6 +19,7 @@ package futex
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
@@ -66,9 +67,9 @@ type Key struct {
Offset uint64
}
-func (k *Key) release() {
+func (k *Key) release(t Target) {
if k.MappingIdentity != nil {
- k.MappingIdentity.DecRef()
+ k.MappingIdentity.DecRef(t)
}
k.Mappable = nil
k.MappingIdentity = nil
@@ -94,6 +95,8 @@ func (k *Key) matches(k2 *Key) bool {
// Target abstracts memory accesses and keys.
type Target interface {
+ context.Context
+
// SwapUint32 gives access to usermem.IO.SwapUint32.
SwapUint32(addr usermem.Addr, new uint32) (uint32, error)
@@ -296,7 +299,7 @@ func (b *bucket) wakeWaiterLocked(w *Waiter) {
// bucket "to".
//
// Preconditions: b and to must be locked.
-func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int {
+func (b *bucket) requeueLocked(t Target, to *bucket, key, nkey *Key, n int) int {
done := 0
for w := b.waiters.Front(); done < n && w != nil; {
if !w.key.matches(key) {
@@ -308,7 +311,7 @@ func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int {
requeued := w
w = w.Next() // Next iteration.
b.waiters.Remove(requeued)
- requeued.key.release()
+ requeued.key.release(t)
requeued.key = nkey.clone()
to.waiters.PushBack(requeued)
requeued.bucket.Store(to)
@@ -456,7 +459,7 @@ func (m *Manager) Wake(t Target, addr usermem.Addr, private bool, bitmask uint32
r := b.wakeLocked(&k, bitmask, n)
b.mu.Unlock()
- k.release()
+ k.release(t)
return r, nil
}
@@ -465,12 +468,12 @@ func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, ch
if err != nil {
return 0, err
}
- defer k1.release()
+ defer k1.release(t)
k2, err := getKey(t, naddr, private)
if err != nil {
return 0, err
}
- defer k2.release()
+ defer k2.release(t)
b1, b2 := m.lockBuckets(&k1, &k2)
defer b1.mu.Unlock()
@@ -488,7 +491,7 @@ func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, ch
done := b1.wakeLocked(&k1, ^uint32(0), nwake)
// Requeue the number required.
- b1.requeueLocked(b2, &k1, &k2, nreq)
+ b1.requeueLocked(t, b2, &k1, &k2, nreq)
return done, nil
}
@@ -515,12 +518,12 @@ func (m *Manager) WakeOp(t Target, addr1, addr2 usermem.Addr, private bool, nwak
if err != nil {
return 0, err
}
- defer k1.release()
+ defer k1.release(t)
k2, err := getKey(t, addr2, private)
if err != nil {
return 0, err
}
- defer k2.release()
+ defer k2.release(t)
b1, b2 := m.lockBuckets(&k1, &k2)
defer b1.mu.Unlock()
@@ -571,7 +574,7 @@ func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bo
// Perform our atomic check.
if err := check(t, addr, val); err != nil {
b.mu.Unlock()
- w.key.release()
+ w.key.release(t)
return err
}
@@ -585,7 +588,7 @@ func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bo
// WaitComplete must be called when a Waiter previously added by WaitPrepare is
// no longer eligible to be woken.
-func (m *Manager) WaitComplete(w *Waiter) {
+func (m *Manager) WaitComplete(w *Waiter, t Target) {
// Remove w from the bucket it's in.
for {
b := w.bucket.Load()
@@ -617,7 +620,7 @@ func (m *Manager) WaitComplete(w *Waiter) {
}
// Release references held by the waiter.
- w.key.release()
+ w.key.release(t)
}
// LockPI attempts to lock the futex following the Priority-inheritance futex
@@ -648,13 +651,13 @@ func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, pri
success, err := m.lockPILocked(w, t, addr, tid, b, try)
if err != nil {
- w.key.release()
+ w.key.release(t)
b.mu.Unlock()
return false, err
}
if success || try {
// Release waiter if it's not going to be a wait.
- w.key.release()
+ w.key.release(t)
}
b.mu.Unlock()
return success, nil
@@ -717,10 +720,10 @@ func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint3
}
}
-// UnlockPI unlock the futex following the Priority-inheritance futex
-// rules. The address provided must contain the caller's TID. If there are
-// waiters, TID of the next waiter (FIFO) is set to the given address, and the
-// waiter woken up. If there are no waiters, 0 is set to the address.
+// UnlockPI unlocks the futex following the Priority-inheritance futex rules.
+// The address provided must contain the caller's TID. If there are waiters,
+// the TID of the next waiter (FIFO) is written to the address and that
+// waiter is woken up. If there are no waiters, 0 is written to the address.
func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool) error {
k, err := getKey(t, addr, private)
if err != nil {
@@ -730,7 +733,7 @@ func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool
err = m.unlockPILocked(t, addr, tid, b, &k)
- k.release()
+ k.release(t)
b.mu.Unlock()
return err
}
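Because Target now embeds context.Context, a waiter's lifecycle threads the
target through both prepare and complete. A condensed sketch, assuming a
task t that implements Target (kernel.Task does):

	w := futex.NewWaiter()
	if err := m.WaitPrepare(w, t, addr, private, val, ^uint32(0)); err != nil {
		return err
	}
	<-w.C                // woken by a matching futex wakeup
	m.WaitComplete(w, t) // t doubles as the context for key release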
diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go
index 7c5c7665b..d0128c548 100644
--- a/pkg/sentry/kernel/futex/futex_test.go
+++ b/pkg/sentry/kernel/futex/futex_test.go
@@ -22,6 +22,7 @@ import (
"testing"
"unsafe"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -29,28 +30,33 @@ import (
// testData implements the Target interface, and allows us to
// treat the address passed for futex operations as an index in
// a byte slice for testing simplicity.
-type testData []byte
+type testData struct {
+ context.Context
+ data []byte
+}
const sizeofInt32 = 4
func newTestData(size uint) testData {
- return make([]byte, size)
+ return testData{
+ data: make([]byte, size),
+ }
}
func (t testData) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) {
- val := atomic.SwapUint32((*uint32)(unsafe.Pointer(&t[addr])), new)
+ val := atomic.SwapUint32((*uint32)(unsafe.Pointer(&t.data[addr])), new)
return val, nil
}
func (t testData) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) {
- if atomic.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&t[addr])), old, new) {
+ if atomic.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&t.data[addr])), old, new) {
return old, nil
}
- return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil
+ return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t.data[addr]))), nil
}
func (t testData) LoadUint32(addr usermem.Addr) (uint32, error) {
- return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil
+ return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t.data[addr]))), nil
}
func (t testData) GetSharedKey(addr usermem.Addr) (Key, error) {
@@ -83,7 +89,7 @@ func TestFutexWake(t *testing.T) {
// Start waiting for wakeup.
w := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(w)
+ defer m.WaitComplete(w, d)
// Perform a wakeup.
if n, err := m.Wake(d, 0, private, ^uint32(0), 1); err != nil || n != 1 {
@@ -106,7 +112,7 @@ func TestFutexWakeBitmask(t *testing.T) {
// Start waiting for wakeup.
w := newPreparedTestWaiter(t, m, d, 0, private, 0, 0x0000ffff)
- defer m.WaitComplete(w)
+ defer m.WaitComplete(w, d)
// Perform a wakeup using the wrong bitmask.
if n, err := m.Wake(d, 0, private, 0xffff0000, 1); err != nil || n != 0 {
@@ -141,7 +147,7 @@ func TestFutexWakeTwo(t *testing.T) {
var ws [3]*Waiter
for i := range ws {
ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(ws[i])
+ defer m.WaitComplete(ws[i], d)
}
// Perform two wakeups.
@@ -174,9 +180,9 @@ func TestFutexWakeUnrelated(t *testing.T) {
// Start two waiters waiting for wakeup on different addresses.
w1 := newPreparedTestWaiter(t, m, d, 0*sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w1)
+ defer m.WaitComplete(w1, d)
w2 := newPreparedTestWaiter(t, m, d, 1*sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w2)
+ defer m.WaitComplete(w2, d)
// Perform two wakeups on the second address.
if n, err := m.Wake(d, 1*sizeofInt32, private, ^uint32(0), 2); err != nil || n != 1 {
@@ -216,9 +222,9 @@ func TestWakeOpFirstNonEmpty(t *testing.T) {
// Add two waiters on address 0.
w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(w1)
+ defer m.WaitComplete(w1, d)
w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(w2)
+ defer m.WaitComplete(w2, d)
// Perform 10 wakeups on address 0.
if n, err := m.WakeOp(d, 0, sizeofInt32, private, 10, 0, 0); err != nil || n != 2 {
@@ -244,9 +250,9 @@ func TestWakeOpSecondNonEmpty(t *testing.T) {
// Add two waiters on address sizeofInt32.
w1 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w1)
+ defer m.WaitComplete(w1, d)
w2 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w2)
+ defer m.WaitComplete(w2, d)
// Perform 10 wakeups on address sizeofInt32 (contingent on
// d.Op(0), which should succeed).
@@ -273,9 +279,9 @@ func TestWakeOpSecondNonEmptyFailingOp(t *testing.T) {
// Add two waiters on address sizeofInt32.
w1 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w1)
+ defer m.WaitComplete(w1, d)
w2 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w2)
+ defer m.WaitComplete(w2, d)
// Perform 10 wakeups on address sizeofInt32 (contingent on
// d.Op(1), which should fail).
@@ -302,15 +308,15 @@ func TestWakeOpAllNonEmpty(t *testing.T) {
// Add two waiters on address 0.
w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(w1)
+ defer m.WaitComplete(w1, d)
w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(w2)
+ defer m.WaitComplete(w2, d)
// Add two waiters on address sizeofInt32.
w3 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w3)
+ defer m.WaitComplete(w3, d)
w4 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w4)
+ defer m.WaitComplete(w4, d)
// Perform 10 wakeups on address 0 (unconditionally), and 10
// wakeups on address sizeofInt32 (contingent on d.Op(0), which
@@ -344,15 +350,15 @@ func TestWakeOpAllNonEmptyFailingOp(t *testing.T) {
// Add two waiters on address 0.
w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(w1)
+ defer m.WaitComplete(w1, d)
w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(w2)
+ defer m.WaitComplete(w2, d)
// Add two waiters on address sizeofInt32.
w3 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w3)
+ defer m.WaitComplete(w3, d)
w4 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0))
- defer m.WaitComplete(w4)
+ defer m.WaitComplete(w4, d)
// Perform 10 wakeups on address 0 (unconditionally), and 10
// wakeups on address sizeofInt32 (contingent on d.Op(1), which
@@ -388,7 +394,7 @@ func TestWakeOpSameAddress(t *testing.T) {
var ws [4]*Waiter
for i := range ws {
ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(ws[i])
+ defer m.WaitComplete(ws[i], d)
}
// Perform 1 wakeup on address 0 (unconditionally), and 1 wakeup
@@ -422,7 +428,7 @@ func TestWakeOpSameAddressFailingOp(t *testing.T) {
var ws [4]*Waiter
for i := range ws {
ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0))
- defer m.WaitComplete(ws[i])
+ defer m.WaitComplete(ws[i], d)
}
// Perform 1 wakeup on address 0 (unconditionally), and 1 wakeup
@@ -472,7 +478,7 @@ func (t *testMutex) Lock() {
for {
// Attempt to grab the lock.
if atomic.CompareAndSwapUint32(
- (*uint32)(unsafe.Pointer(&t.d[t.a])),
+ (*uint32)(unsafe.Pointer(&t.d.data[t.a])),
testMutexUnlocked,
testMutexLocked) {
// Lock held.
@@ -490,7 +496,7 @@ func (t *testMutex) Lock() {
panic("WaitPrepare returned unexpected error: " + err.Error())
}
<-w.C
- t.m.WaitComplete(w)
+ t.m.WaitComplete(w, t.d)
}
}
@@ -498,7 +504,7 @@ func (t *testMutex) Lock() {
// This will notify any waiters via the futex manager.
func (t *testMutex) Unlock() {
// Unlock.
- atomic.StoreUint32((*uint32)(unsafe.Pointer(&t.d[t.a])), testMutexUnlocked)
+ atomic.StoreUint32((*uint32)(unsafe.Pointer(&t.d.data[t.a])), testMutexUnlocked)
// Notify all waiters.
t.m.Wake(t.d, t.a, true, ^uint32(0), math.MaxInt32)
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 5efeb3767..1028d13c6 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -34,7 +34,6 @@ package kernel
import (
"errors"
"fmt"
- "io"
"path/filepath"
"sync/atomic"
"time"
@@ -73,6 +72,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/uniqueid"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/state"
+ "gvisor.dev/gvisor/pkg/state/wire"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
)
@@ -81,6 +81,10 @@ import (
// easy access everywhere. To be removed once VFS2 becomes the default.
var VFS2Enabled = false
+// FUSEEnabled is set to true when FUSE is enabled. Added as a global to
+// allow easy access everywhere. To be removed once FUSE is complete.
+var FUSEEnabled = false
+
// Kernel represents an emulated Linux kernel. It must be initialized by calling
// Init() or LoadFrom().
//
@@ -194,11 +198,6 @@ type Kernel struct {
// cpuClockTickerSetting is protected by runningTasksMu.
cpuClockTickerSetting ktime.Setting
- // fdMapUids is an ever-increasing counter for generating FDTable uids.
- //
- // fdMapUids is mutable, and is accessed using atomic memory operations.
- fdMapUids uint64
-
// uniqueID is used to generate unique identifiers.
//
// uniqueID is mutable, and is accessed using atomic memory operations.
@@ -377,7 +376,8 @@ func (k *Kernel) Init(args InitKernelArgs) error {
k.netlinkPorts = port.New()
if VFS2Enabled {
- if err := k.vfs.Init(); err != nil {
+ ctx := k.SupervisorContext()
+ if err := k.vfs.Init(ctx); err != nil {
return fmt.Errorf("failed to initialize VFS: %v", err)
}
@@ -385,19 +385,19 @@ func (k *Kernel) Init(args InitKernelArgs) error {
if err != nil {
return fmt.Errorf("failed to create pipefs filesystem: %v", err)
}
- defer pipeFilesystem.DecRef()
+ defer pipeFilesystem.DecRef(ctx)
pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{})
if err != nil {
return fmt.Errorf("failed to create pipefs mount: %v", err)
}
k.pipeMount = pipeMount
- tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(k.SupervisorContext(), &k.vfs, auth.NewRootCredentials(k.rootUserNamespace))
+ tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(ctx, &k.vfs, auth.NewRootCredentials(k.rootUserNamespace))
if err != nil {
return fmt.Errorf("failed to create tmpfs filesystem: %v", err)
}
- defer tmpfsFilesystem.DecRef()
- defer tmpfsRoot.DecRef()
+ defer tmpfsFilesystem.DecRef(ctx)
+ defer tmpfsRoot.DecRef(ctx)
shmMount, err := k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{})
if err != nil {
return fmt.Errorf("failed to create tmpfs mount: %v", err)
@@ -408,7 +408,7 @@ func (k *Kernel) Init(args InitKernelArgs) error {
if err != nil {
return fmt.Errorf("failed to create sockfs filesystem: %v", err)
}
- defer socketFilesystem.DecRef()
+ defer socketFilesystem.DecRef(ctx)
socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{})
if err != nil {
return fmt.Errorf("failed to create sockfs mount: %v", err)
@@ -422,7 +422,7 @@ func (k *Kernel) Init(args InitKernelArgs) error {
// SaveTo saves the state of k to w.
//
// Preconditions: The kernel must be paused throughout the call to SaveTo.
-func (k *Kernel) SaveTo(w io.Writer) error {
+func (k *Kernel) SaveTo(w wire.Writer) error {
saveStart := time.Now()
ctx := k.SupervisorContext()
@@ -431,8 +431,8 @@ func (k *Kernel) SaveTo(w io.Writer) error {
defer k.extMu.Unlock()
// Stop time.
- k.pauseTimeLocked()
- defer k.resumeTimeLocked()
+ k.pauseTimeLocked(ctx)
+ defer k.resumeTimeLocked(ctx)
// Evict all evictable MemoryFile allocations.
k.mf.StartEvictions()
@@ -448,18 +448,16 @@ func (k *Kernel) SaveTo(w io.Writer) error {
// Remove all epoll waiter objects from underlying wait queues.
// NOTE: for programs to resume execution in future snapshot scenarios,
// we will need to re-establish these waiter objects after saving.
- k.tasks.unregisterEpollWaiters()
+ k.tasks.unregisterEpollWaiters(ctx)
// Clear the dirent cache before saving because Dirents must be Loaded in a
// particular order (parents before children), and Loading dirents from a cache
// breaks that order.
- if err := k.flushMountSourceRefs(); err != nil {
+ if err := k.flushMountSourceRefs(ctx); err != nil {
return err
}
- // Ensure that all pending asynchronous work is complete:
- // - inode and mount release
- // - asynchronuous IO
+ // Ensure that all inode and mount release operations have completed.
fs.AsyncBarrier()
// Once all fs work has completed (flushed references have all been released),
@@ -480,18 +478,18 @@ func (k *Kernel) SaveTo(w io.Writer) error {
//
// N.B. This will also be saved along with the full kernel save below.
cpuidStart := time.Now()
- if err := state.Save(k.SupervisorContext(), w, k.FeatureSet(), nil); err != nil {
+ if _, err := state.Save(k.SupervisorContext(), w, k.FeatureSet()); err != nil {
return err
}
log.Infof("CPUID save took [%s].", time.Since(cpuidStart))
// Save the kernel state.
kernelStart := time.Now()
- var stats state.Stats
- if err := state.Save(k.SupervisorContext(), w, k, &stats); err != nil {
+ stats, err := state.Save(k.SupervisorContext(), w, k)
+ if err != nil {
return err
}
- log.Infof("Kernel save stats: %s", &stats)
+ log.Infof("Kernel save stats: %s", stats.String())
log.Infof("Kernel save took [%s].", time.Since(kernelStart))
// Save the memory file's state.
@@ -508,7 +506,7 @@ func (k *Kernel) SaveTo(w io.Writer) error {
// flushMountSourceRefs flushes the MountSources for all mounted filesystems
// and open FDs.
-func (k *Kernel) flushMountSourceRefs() error {
+func (k *Kernel) flushMountSourceRefs(ctx context.Context) error {
// Flush all mount sources for currently mounted filesystems in each task.
flushed := make(map[*fs.MountNamespace]struct{})
k.tasks.mu.RLock()
@@ -524,7 +522,7 @@ func (k *Kernel) flushMountSourceRefs() error {
// There may be some open FDs whose filesystems have been unmounted. We
// must flush those as well.
- return k.tasks.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error {
+ return k.tasks.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error {
file.Dirent.Inode.MountSource.FlushDirentRefs()
return nil
})
@@ -534,7 +532,7 @@ func (k *Kernel) flushMountSourceRefs() error {
// each task.
//
// Precondition: Must be called with the kernel paused.
-func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) {
+func (ts *TaskSet) forEachFDPaused(ctx context.Context, f func(*fs.File, *vfs.FileDescription) error) (err error) {
// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
if VFS2Enabled {
return nil
@@ -547,7 +545,7 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error)
if t.fdTable == nil {
continue
}
- t.fdTable.forEach(func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) {
+ t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) {
if lastErr := f(file, fileVFS2); lastErr != nil && err == nil {
err = lastErr
}
@@ -558,7 +556,7 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error)
func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
- return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error {
+ return ts.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error {
if flags := file.Flags(); !flags.Write {
return nil
}
@@ -605,7 +603,7 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
return nil
}
-func (ts *TaskSet) unregisterEpollWaiters() {
+func (ts *TaskSet) unregisterEpollWaiters(ctx context.Context) {
// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
if VFS2Enabled {
return
@@ -626,7 +624,7 @@ func (ts *TaskSet) unregisterEpollWaiters() {
if _, ok := processed[t.fdTable]; ok {
continue
}
- t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+ t.fdTable.forEach(ctx, func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
e.UnregisterEpollWaiters()
}
@@ -636,7 +634,7 @@ func (ts *TaskSet) unregisterEpollWaiters() {
}
// LoadFrom returns a new Kernel loaded from args.
-func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) error {
+func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clocks) error {
loadStart := time.Now()
initAppCores := k.applicationCores
@@ -647,7 +645,7 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks)
// don't need to explicitly install it in the Kernel.
cpuidStart := time.Now()
var features cpuid.FeatureSet
- if err := state.Load(k.SupervisorContext(), r, &features, nil); err != nil {
+ if _, err := state.Load(k.SupervisorContext(), r, &features); err != nil {
return err
}
log.Infof("CPUID load took [%s].", time.Since(cpuidStart))
@@ -662,11 +660,11 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks)
// Load the kernel state.
kernelStart := time.Now()
- var stats state.Stats
- if err := state.Load(k.SupervisorContext(), r, k, &stats); err != nil {
+ stats, err := state.Load(k.SupervisorContext(), r, k)
+ if err != nil {
return err
}
- log.Infof("Kernel load stats: %s", &stats)
+ log.Infof("Kernel load stats: %s", stats.String())
log.Infof("Kernel load took [%s].", time.Since(kernelStart))
// rootNetworkNamespace should be populated after loading the state file.
@@ -897,13 +895,13 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
if mntnsVFS2 == nil {
// MountNamespaceVFS2 adds a reference to the namespace, which is
// transferred to the new process.
- mntnsVFS2 = k.GlobalInit().Leader().MountNamespaceVFS2()
+ mntnsVFS2 = k.globalInit.Leader().MountNamespaceVFS2()
}
// Get the root directory from the MountNamespace.
root := args.MountNamespaceVFS2.Root()
// The call to newFSContext below will take a reference on root, so we
// don't need to hold this one.
- defer root.DecRef()
+ defer root.DecRef(ctx)
// Grab the working directory.
wd := root // Default.
@@ -921,7 +919,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
if err != nil {
return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
}
- defer wd.DecRef()
+ defer wd.DecRef(ctx)
}
opener = fsbridge.NewVFSLookup(mntnsVFS2, root, wd)
fsContext = NewFSContextVFS2(root, wd, args.Umask)
@@ -936,7 +934,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
root := mntns.Root()
// The call to newFSContext below will take a reference on root, so we
// don't need to hold this one.
- defer root.DecRef()
+ defer root.DecRef(ctx)
// Grab the working directory.
remainingTraversals := args.MaxSymlinkTraversals
@@ -947,7 +945,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
if err != nil {
return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
}
- defer wd.DecRef()
+ defer wd.DecRef(ctx)
}
opener = fsbridge.NewFSLookup(mntns, root, wd)
fsContext = newFSContext(root, wd, args.Umask)
@@ -1057,7 +1055,7 @@ func (k *Kernel) Start() error {
// If k was created by LoadKernelFrom, timers were stopped during
// Kernel.SaveTo and need to be resumed. If k was created by NewKernel,
// this is a no-op.
- k.resumeTimeLocked()
+ k.resumeTimeLocked(k.SupervisorContext())
// Start task goroutines.
k.tasks.mu.RLock()
defer k.tasks.mu.RUnlock()
@@ -1071,7 +1069,7 @@ func (k *Kernel) Start() error {
//
// Preconditions: Any task goroutines running in k must be stopped. k.extMu
// must be locked.
-func (k *Kernel) pauseTimeLocked() {
+func (k *Kernel) pauseTimeLocked(ctx context.Context) {
// k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before
// Kernel.Start().
if k.cpuClockTicker != nil {
@@ -1093,7 +1091,7 @@ func (k *Kernel) pauseTimeLocked() {
// This means we'll iterate FDTables shared by multiple tasks repeatedly,
// but ktime.Timer.Pause is idempotent so this is harmless.
if t.fdTable != nil {
- t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
+ t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
if VFS2Enabled {
if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
tfd.PauseTimer()
@@ -1115,7 +1113,7 @@ func (k *Kernel) pauseTimeLocked() {
//
// Preconditions: Any task goroutines running in k must be stopped. k.extMu
// must be locked.
-func (k *Kernel) resumeTimeLocked() {
+func (k *Kernel) resumeTimeLocked(ctx context.Context) {
if k.cpuClockTicker != nil {
k.cpuClockTicker.Resume()
}
@@ -1129,7 +1127,7 @@ func (k *Kernel) resumeTimeLocked() {
}
}
if t.fdTable != nil {
- t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
+ t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
if VFS2Enabled {
if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
tfd.ResumeTimer()
@@ -1254,13 +1252,22 @@ func (k *Kernel) Kill(es ExitStatus) {
}
// Pause requests that all tasks in k temporarily stop executing, and blocks
-// until all tasks in k have stopped. Multiple calls to Pause nest and require
-// an equal number of calls to Unpause to resume execution.
+// until all tasks and asynchronous I/O operations in k have stopped. Multiple
+// calls to Pause nest and require an equal number of calls to Unpause to
+// resume execution.
func (k *Kernel) Pause() {
k.extMu.Lock()
k.tasks.BeginExternalStop()
k.extMu.Unlock()
k.tasks.runningGoroutines.Wait()
+ k.tasks.aioGoroutines.Wait()
+}
+
+// ReceiveTaskStates receives full states for all tasks.
+func (k *Kernel) ReceiveTaskStates() {
+ k.extMu.Lock()
+ k.tasks.PullFullState()
+ k.extMu.Unlock()
}
// Unpause ends the effect of a previous call to Pause. If Unpause is called
@@ -1470,6 +1477,11 @@ func (k *Kernel) NowMonotonic() int64 {
return now
}
+// AfterFunc implements tcpip.Clock.AfterFunc.
+func (k *Kernel) AfterFunc(d time.Duration, f func()) tcpip.Timer {
+ return ktime.TcpipAfterFunc(k.realtimeClock, d, f)
+}
+
// SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or
// LoadFrom.
func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) {
@@ -1507,7 +1519,7 @@ type SocketEntry struct {
}
// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
-func (s *SocketEntry) WeakRefGone() {
+func (s *SocketEntry) WeakRefGone(context.Context) {
s.k.extMu.Lock()
s.k.sockets.Remove(s)
s.k.extMu.Unlock()
@@ -1596,7 +1608,7 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
return vfs.VirtualDentry{}
}
mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
// Root() takes a reference on the root dirent for us.
return mntns.Root()
case vfs.CtxMountNamespace:
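With AfterFunc added, *Kernel now satisfies all of tcpip.Clock. A hedged
usage sketch (the callback body is illustrative only):

	var clock tcpip.Clock = k // k is the *kernel.Kernel
	timer := clock.AfterFunc(500*time.Millisecond, func() {
		log.Infof("kernel-clock timer fired")
	})
	defer timer.Stop() // tcpip.Timer supports Stop and Reset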
diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD
index f29dc0472..449643118 100644
--- a/pkg/sentry/kernel/pipe/BUILD
+++ b/pkg/sentry/kernel/pipe/BUILD
@@ -8,6 +8,7 @@ go_library(
"device.go",
"node.go",
"pipe.go",
+ "pipe_unsafe.go",
"pipe_util.go",
"reader.go",
"reader_writer.go",
@@ -20,10 +21,12 @@ go_library(
"//pkg/amutex",
"//pkg/buffer",
"//pkg/context",
+ "//pkg/safemem",
"//pkg/sentry/arch",
"//pkg/sentry/device",
"//pkg/sentry/fs",
"//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/vfs",
"//pkg/sync",
"//pkg/syserror",
diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go
index 4b688c627..6497dc4ba 100644
--- a/pkg/sentry/kernel/pipe/node.go
+++ b/pkg/sentry/kernel/pipe/node.go
@@ -93,7 +93,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi
if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() {
if !waitFor(&i.mu, &i.wWakeup, ctx) {
- r.DecRef()
+ r.DecRef(ctx)
return nil, syserror.ErrInterrupted
}
}
@@ -111,12 +111,12 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi
// On a nonblocking, write-only open, the open fails with ENXIO if the
// read side isn't open yet.
if flags.NonBlocking {
- w.DecRef()
+ w.DecRef(ctx)
return nil, syserror.ENXIO
}
if !waitFor(&i.mu, &i.rWakeup, ctx) {
- w.DecRef()
+ w.DecRef(ctx)
return nil, syserror.ErrInterrupted
}
}
diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go
index ab75a87ff..ce0db5583 100644
--- a/pkg/sentry/kernel/pipe/node_test.go
+++ b/pkg/sentry/kernel/pipe/node_test.go
@@ -167,7 +167,7 @@ func TestClosedReaderBlocksWriteOpen(t *testing.T) {
f := NewInodeOperations(ctx, perms, newNamedPipe(t))
rFile, _ := testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil)
- rFile.DecRef()
+ rFile.DecRef(ctx)
wDone := make(chan struct{})
// This open for write should block because the reader is now gone.
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 62c8691f1..297e8f28f 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -152,7 +152,7 @@ func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs.
d := fs.NewDirent(ctx, fs.NewInode(ctx, iops, ms, sattr), fmt.Sprintf("pipe:[%d]", ino))
// The p.Open calls below will each take a reference on the Dirent. We
// must drop the one we already have.
- defer d.DecRef()
+ defer d.DecRef(ctx)
return p.Open(ctx, d, fs.FileFlags{Read: true}), p.Open(ctx, d, fs.FileFlags{Write: true})
}
@@ -207,7 +207,10 @@ func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) {
p.mu.Lock()
defer p.mu.Unlock()
+ return p.readLocked(ctx, ops)
+}
+func (p *Pipe) readLocked(ctx context.Context, ops readOps) (int64, error) {
// Is the pipe empty?
if p.view.Size() == 0 {
if !p.HasWriters() {
@@ -246,7 +249,10 @@ type writeOps struct {
func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) {
p.mu.Lock()
defer p.mu.Unlock()
+ return p.writeLocked(ctx, ops)
+}
+func (p *Pipe) writeLocked(ctx context.Context, ops writeOps) (int64, error) {
// Can't write to a pipe with no readers.
if !p.HasReaders() {
return 0, syscall.EPIPE
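The readLocked/writeLocked split lets a caller that already holds the pipe
locks nest operations, which is exactly what spliceOrTee in vfs.go (below)
needs. The unlocked wrappers reduce to a sketch like:

	p.mu.Lock()
	n, err := p.readLocked(ctx, ops) // ops consume from p.view under p.mu
	p.mu.Unlock()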
diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go
index bda739dbe..fe97e9800 100644
--- a/pkg/sentry/kernel/pipe/pipe_test.go
+++ b/pkg/sentry/kernel/pipe/pipe_test.go
@@ -27,8 +27,8 @@ import (
func TestPipeRW(t *testing.T) {
ctx := contexttest.Context(t)
r, w := NewConnectedPipe(ctx, 65536, 4096)
- defer r.DecRef()
- defer w.DecRef()
+ defer r.DecRef(ctx)
+ defer w.DecRef(ctx)
msg := []byte("here's some bytes")
wantN := int64(len(msg))
@@ -47,8 +47,8 @@ func TestPipeRW(t *testing.T) {
func TestPipeReadBlock(t *testing.T) {
ctx := contexttest.Context(t)
r, w := NewConnectedPipe(ctx, 65536, 4096)
- defer r.DecRef()
- defer w.DecRef()
+ defer r.DecRef(ctx)
+ defer w.DecRef(ctx)
n, err := r.Readv(ctx, usermem.BytesIOSequence(make([]byte, 1)))
if n != 0 || err != syserror.ErrWouldBlock {
@@ -62,8 +62,8 @@ func TestPipeWriteBlock(t *testing.T) {
ctx := contexttest.Context(t)
r, w := NewConnectedPipe(ctx, capacity, atomicIOBytes)
- defer r.DecRef()
- defer w.DecRef()
+ defer r.DecRef(ctx)
+ defer w.DecRef(ctx)
msg := make([]byte, capacity+1)
n, err := w.Writev(ctx, usermem.BytesIOSequence(msg))
@@ -77,8 +77,8 @@ func TestPipeWriteUntilEnd(t *testing.T) {
ctx := contexttest.Context(t)
r, w := NewConnectedPipe(ctx, atomicIOBytes, atomicIOBytes)
- defer r.DecRef()
- defer w.DecRef()
+ defer r.DecRef(ctx)
+ defer w.DecRef(ctx)
msg := []byte("here's some bytes")
diff --git a/pkg/sentry/kernel/pipe/pipe_unsafe.go b/pkg/sentry/kernel/pipe/pipe_unsafe.go
new file mode 100644
index 000000000..dd60cba24
--- /dev/null
+++ b/pkg/sentry/kernel/pipe/pipe_unsafe.go
@@ -0,0 +1,35 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+import (
+ "unsafe"
+)
+
+// lockTwoPipes locks both x.mu and y.mu in an order that is guaranteed to be
+// consistent for both lockTwoPipes(x, y) and lockTwoPipes(y, x), such that
+// concurrent calls cannot deadlock.
+//
+// Preconditions: x != y.
+func lockTwoPipes(x, y *Pipe) {
+ // Lock the two pipes in order of increasing address.
+ if uintptr(unsafe.Pointer(x)) < uintptr(unsafe.Pointer(y)) {
+ x.mu.Lock()
+ y.mu.Lock()
+ } else {
+ y.mu.Lock()
+ x.mu.Lock()
+ }
+}
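Usage mirrors spliceOrTee in vfs.go below: lock both pipes through the
helper, then unlock in any order (ordering only matters for acquisition):

	lockTwoPipes(dst.pipe, src.pipe)
	defer dst.pipe.mu.Unlock()
	defer src.pipe.mu.Unlock()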
diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go
index aacf28da2..6d58b682f 100644
--- a/pkg/sentry/kernel/pipe/pipe_util.go
+++ b/pkg/sentry/kernel/pipe/pipe_util.go
@@ -33,7 +33,7 @@ import (
// the old fs architecture.
// Release cleans up the pipe's state.
-func (p *Pipe) Release() {
+func (p *Pipe) Release(context.Context) {
p.rClose()
p.wClose()
diff --git a/pkg/sentry/kernel/pipe/reader.go b/pkg/sentry/kernel/pipe/reader.go
index 7724b4452..ac18785c0 100644
--- a/pkg/sentry/kernel/pipe/reader.go
+++ b/pkg/sentry/kernel/pipe/reader.go
@@ -15,6 +15,7 @@
package pipe
import (
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -29,7 +30,7 @@ type Reader struct {
// Release implements fs.FileOperations.Release.
//
// This overrides ReaderWriter.Release.
-func (r *Reader) Release() {
+func (r *Reader) Release(context.Context) {
r.Pipe.rClose()
// Wake up writers.
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index b54f08a30..28f998e45 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -16,8 +16,11 @@ package pipe
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/buffer"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
@@ -59,11 +62,13 @@ func NewVFSPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *VFSPipe {
//
// Preconditions: statusFlags should not contain an open access mode.
func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, *vfs.FileDescription) {
- return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags)
+ // Connected pipes share the same locks.
+ locks := &vfs.FileLocks{}
+ return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags, locks), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags, locks)
}
// Open opens the pipe represented by vp.
-func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, error) {
+func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) (*vfs.FileDescription, error) {
vp.mu.Lock()
defer vp.mu.Unlock()
@@ -73,7 +78,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s
return nil, syserror.EINVAL
}
- fd := vp.newFD(mnt, vfsd, statusFlags)
+ fd := vp.newFD(mnt, vfsd, statusFlags, locks)
// Named pipes have special blocking semantics during open:
//
@@ -96,7 +101,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s
// If this pipe is being opened as blocking and there's no
// writer, we have to wait for a writer to open the other end.
if vp.pipe.isNamed && statusFlags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) {
- fd.DecRef()
+ fd.DecRef(ctx)
return nil, syserror.EINTR
}
@@ -107,12 +112,12 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s
// Non-blocking, write-only opens fail with ENXIO when the read
// side isn't open yet.
if statusFlags&linux.O_NONBLOCK != 0 {
- fd.DecRef()
+ fd.DecRef(ctx)
return nil, syserror.ENXIO
}
// Wait for a reader to open the other end.
if !waitFor(&vp.mu, &vp.rWakeup, ctx) {
- fd.DecRef()
+ fd.DecRef(ctx)
return nil, syserror.EINTR
}
}
@@ -125,10 +130,11 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s
}
// Preconditions: vp.mu must be held.
-func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) *vfs.FileDescription {
+func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) *vfs.FileDescription {
fd := &VFSPipeFD{
pipe: &vp.pipe,
}
+ fd.LockFD.Init(locks)
fd.vfsfd.Init(fd, statusFlags, mnt, vfsd, &vfs.FileDescriptionOptions{
DenyPRead: true,
DenyPWrite: true,
@@ -150,17 +156,20 @@ func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) *
return &fd.vfsfd
}
-// VFSPipeFD implements vfs.FileDescriptionImpl for pipes.
+// VFSPipeFD implements vfs.FileDescriptionImpl for pipes. It also implements
+// non-atomic usermem.IO methods, allowing it to be passed as usermem.IO to
+// other FileDescriptions for splice(2) and tee(2).
type VFSPipeFD struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
+ vfs.LockFD
pipe *Pipe
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *VFSPipeFD) Release() {
+func (fd *VFSPipeFD) Release(context.Context) {
var event waiter.EventMask
if fd.vfsfd.IsReadable() {
fd.pipe.rClose()
@@ -191,6 +200,11 @@ func (fd *VFSPipeFD) Readiness(mask waiter.EventMask) waiter.EventMask {
}
}
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *VFSPipeFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ return syserror.ESPIPE
+}
+
// EventRegister implements waiter.Waitable.EventRegister.
func (fd *VFSPipeFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
fd.pipe.EventRegister(e, mask)
@@ -229,3 +243,226 @@ func (fd *VFSPipeFD) PipeSize() int64 {
func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) {
return fd.pipe.SetFifoSize(size)
}
+
+// IOSequence returns a usermem.IOSequence that reads up to count bytes from,
+// or writes up to count bytes to, fd.
+func (fd *VFSPipeFD) IOSequence(count int64) usermem.IOSequence {
+ return usermem.IOSequence{
+ IO: fd,
+ Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}),
+ }
+}
+
+// CopyIn implements usermem.IO.CopyIn.
+func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) {
+ origCount := int64(len(dst))
+ n, err := fd.pipe.read(ctx, readOps{
+ left: func() int64 {
+ return int64(len(dst))
+ },
+ limit: func(l int64) {
+ dst = dst[:l]
+ },
+ read: func(view *buffer.View) (int64, error) {
+ n, err := view.ReadAt(dst, 0)
+ view.TrimFront(int64(n))
+ return int64(n), err
+ },
+ })
+ if n > 0 {
+ fd.pipe.Notify(waiter.EventOut)
+ }
+ if err == nil && n != origCount {
+ return int(n), syserror.ErrWouldBlock
+ }
+ return int(n), err
+}
+
+// CopyOut implements usermem.IO.CopyOut.
+func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) {
+ origCount := int64(len(src))
+ n, err := fd.pipe.write(ctx, writeOps{
+ left: func() int64 {
+ return int64(len(src))
+ },
+ limit: func(l int64) {
+ src = src[:l]
+ },
+ write: func(view *buffer.View) (int64, error) {
+ view.Append(src)
+ return int64(len(src)), nil
+ },
+ })
+ if n > 0 {
+ fd.pipe.Notify(waiter.EventIn)
+ }
+ if err == nil && n != origCount {
+ return int(n), syserror.ErrWouldBlock
+ }
+ return int(n), err
+}
+
+// ZeroOut implements usermem.IO.ZeroOut.
+func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) {
+ origCount := toZero
+ n, err := fd.pipe.write(ctx, writeOps{
+ left: func() int64 {
+ return toZero
+ },
+ limit: func(l int64) {
+ toZero = l
+ },
+ write: func(view *buffer.View) (int64, error) {
+ view.Grow(view.Size()+toZero, true /* zero */)
+ return toZero, nil
+ },
+ })
+ if n > 0 {
+ fd.pipe.Notify(waiter.EventIn)
+ }
+ if err == nil && n != origCount {
+ return n, syserror.ErrWouldBlock
+ }
+ return n, err
+}
+
+// CopyInTo implements usermem.IO.CopyInTo.
+func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) {
+ count := ars.NumBytes()
+ if count == 0 {
+ return 0, nil
+ }
+ origCount := count
+ n, err := fd.pipe.read(ctx, readOps{
+ left: func() int64 {
+ return count
+ },
+ limit: func(l int64) {
+ count = l
+ },
+ read: func(view *buffer.View) (int64, error) {
+ n, err := view.ReadToSafememWriter(dst, uint64(count))
+ view.TrimFront(int64(n))
+ return int64(n), err
+ },
+ })
+ if n > 0 {
+ fd.pipe.Notify(waiter.EventOut)
+ }
+ if err == nil && n != origCount {
+ return n, syserror.ErrWouldBlock
+ }
+ return n, err
+}
+
+// CopyOutFrom implements usermem.IO.CopyOutFrom.
+func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) {
+ count := ars.NumBytes()
+ if count == 0 {
+ return 0, nil
+ }
+ origCount := count
+ n, err := fd.pipe.write(ctx, writeOps{
+ left: func() int64 {
+ return count
+ },
+ limit: func(l int64) {
+ count = l
+ },
+ write: func(view *buffer.View) (int64, error) {
+ n, err := view.WriteFromSafememReader(src, uint64(count))
+ return int64(n), err
+ },
+ })
+ if n > 0 {
+ fd.pipe.Notify(waiter.EventIn)
+ }
+ if err == nil && n != origCount {
+ return n, syserror.ErrWouldBlock
+ }
+ return n, err
+}
+
+// SwapUint32 implements usermem.IO.SwapUint32.
+func (fd *VFSPipeFD) SwapUint32(ctx context.Context, addr usermem.Addr, new uint32, opts usermem.IOOpts) (uint32, error) {
+ // How did a pipe get passed as the virtual address space to futex(2)?
+ panic("VFSPipeFD.SwapUint32 called unexpectedly")
+}
+
+// CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32.
+func (fd *VFSPipeFD) CompareAndSwapUint32(ctx context.Context, addr usermem.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) {
+ panic("VFSPipeFD.CompareAndSwapUint32 called unexpectedly")
+}
+
+// LoadUint32 implements usermem.IO.LoadUint32.
+func (fd *VFSPipeFD) LoadUint32(ctx context.Context, addr usermem.Addr, opts usermem.IOOpts) (uint32, error) {
+ panic("VFSPipeFD.LoadUint32 called unexpectedly")
+}
+
+// Splice reads up to count bytes from src and writes them to dst. It returns
+// the number of bytes moved.
+//
+// Preconditions: count > 0.
+func Splice(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) {
+ return spliceOrTee(ctx, dst, src, count, true /* removeFromSrc */)
+}
+
+// Tee reads up to count bytes from src and writes them to dst, without
+// removing the read bytes from src. It returns the number of bytes copied.
+//
+// Preconditions: count > 0.
+func Tee(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) {
+ return spliceOrTee(ctx, dst, src, count, false /* removeFromSrc */)
+}
+
+// Preconditions: count > 0.
+func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFromSrc bool) (int64, error) {
+ if dst.pipe == src.pipe {
+ return 0, syserror.EINVAL
+ }
+
+ lockTwoPipes(dst.pipe, src.pipe)
+ defer dst.pipe.mu.Unlock()
+ defer src.pipe.mu.Unlock()
+
+ n, err := dst.pipe.writeLocked(ctx, writeOps{
+ left: func() int64 {
+ return count
+ },
+ limit: func(l int64) {
+ count = l
+ },
+ write: func(dstView *buffer.View) (int64, error) {
+ return src.pipe.readLocked(ctx, readOps{
+ left: func() int64 {
+ return count
+ },
+ limit: func(l int64) {
+ count = l
+ },
+ read: func(srcView *buffer.View) (int64, error) {
+ n, err := srcView.ReadToSafememWriter(dstView, uint64(count))
+ if n > 0 && removeFromSrc {
+ srcView.TrimFront(int64(n))
+ }
+ return int64(n), err
+ },
+ })
+ },
+ })
+ if n > 0 {
+ dst.pipe.Notify(waiter.EventIn)
+ src.pipe.Notify(waiter.EventOut)
+ }
+ return n, err
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *VFSPipeFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *VFSPipeFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
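The usermem.IO methods above let a pipe stand in for user memory during
splice(2) and tee(2). A hedged sketch of a pipe-to-file splice (fileFD and
the offset handling are assumptions; IOSequence is from this diff):

	// Let the target file description pull bytes straight out of the
	// pipe's internal buffer, up to count bytes.
	seq := pipeFD.IOSequence(count)
	n, err := fileFD.Write(ctx, seq, vfs.WriteOptions{})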
diff --git a/pkg/sentry/kernel/pipe/writer.go b/pkg/sentry/kernel/pipe/writer.go
index 5bc6aa931..ef4b70ca3 100644
--- a/pkg/sentry/kernel/pipe/writer.go
+++ b/pkg/sentry/kernel/pipe/writer.go
@@ -15,6 +15,7 @@
package pipe
import (
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -29,7 +30,7 @@ type Writer struct {
// Release implements fs.FileOperations.Release.
//
// This overrides ReaderWriter.Release.
-func (w *Writer) Release() {
+func (w *Writer) Release(context.Context) {
w.Pipe.wClose()
// Wake up readers.
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go
index e23e796ef..619b0cb7c 100644
--- a/pkg/sentry/kernel/ptrace.go
+++ b/pkg/sentry/kernel/ptrace.go
@@ -1018,6 +1018,9 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
if err != nil {
return err
}
+
+ t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch())
+
ar := ars.Head()
n, err := target.Arch().PtraceGetRegSet(uintptr(addr), &usermem.IOReadWriter{
Ctx: t,
@@ -1044,10 +1047,14 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
if err != nil {
return err
}
+
+ mm := t.MemoryManager()
+ t.p.PullFullState(mm.AddressSpace(), t.Arch())
+
ar := ars.Head()
n, err := target.Arch().PtraceSetRegSet(uintptr(addr), &usermem.IOReadWriter{
Ctx: t,
- IO: t.MemoryManager(),
+ IO: mm,
Addr: ar.Start,
Opts: usermem.IOOpts{
AddressSpaceActive: true,
@@ -1056,6 +1063,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
if err != nil {
return err
}
+ t.p.FullStateChanged()
ar.End -= usermem.Addr(n)
return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar))
diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go
index 0e19286de..5c4c622c2 100644
--- a/pkg/sentry/kernel/sessions.go
+++ b/pkg/sentry/kernel/sessions.go
@@ -16,6 +16,7 @@ package kernel
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/syserror"
@@ -70,7 +71,7 @@ func (s *Session) incRef() {
//
// Precondition: callers must hold TaskSet.mu for writing.
func (s *Session) decRef() {
- s.refs.DecRefWithDestructor(func() {
+ s.refs.DecRefWithDestructor(nil, func(context.Context) {
// Remove translations from the leader.
for ns := s.leader.pidns; ns != nil; ns = ns.parent {
id := ns.sids[s]
@@ -162,7 +163,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) {
}
alive := true
- pg.refs.DecRefWithDestructor(func() {
+ pg.refs.DecRefWithDestructor(nil, func(context.Context) {
alive = false // don't bother with handleOrphan.
// Remove translations from the originator.
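Passing nil as the context here relies on these destructors never touching
their context argument; the contract reduces to:

	s.refs.DecRefWithDestructor(nil, func(context.Context) {
		// The body must not use the ctx parameter.
	})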
diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD
index bfd779837..c211fc8d0 100644
--- a/pkg/sentry/kernel/shm/BUILD
+++ b/pkg/sentry/kernel/shm/BUILD
@@ -20,7 +20,6 @@ go_library(
"//pkg/sentry/kernel/time",
"//pkg/sentry/memmap",
"//pkg/sentry/pgalloc",
- "//pkg/sentry/platform",
"//pkg/sentry/usage",
"//pkg/sync",
"//pkg/syserror",
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index f66cfcc7f..13ec7afe0 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -45,7 +45,6 @@ import (
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
@@ -370,7 +369,7 @@ type Shm struct {
// fr is the offset into mfp.MemoryFile() that backs this contents of this
// segment. Immutable.
- fr platform.FileRange
+ fr memmap.FileRange
// mu protects all fields below.
mu sync.Mutex `state:"nosave"`
@@ -432,8 +431,8 @@ func (s *Shm) InodeID() uint64 {
// DecRef overrides refs.RefCount.DecRef with a destructor.
//
// Precondition: Caller must not hold s.mu.
-func (s *Shm) DecRef() {
- s.DecRefWithDestructor(s.destroy)
+func (s *Shm) DecRef(ctx context.Context) {
+ s.DecRefWithDestructor(ctx, s.destroy)
}
// Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm
@@ -643,7 +642,7 @@ func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error {
return nil
}
-func (s *Shm) destroy() {
+func (s *Shm) destroy(context.Context) {
s.mfp.MemoryFile().DecRef(s.fr)
s.registry.remove(s)
}
@@ -652,7 +651,7 @@ func (s *Shm) destroy() {
// destroyed once it has no references. MarkDestroyed may be called multiple
// times, and is safe to call after a segment has already been destroyed. See
// shmctl(IPC_RMID).
-func (s *Shm) MarkDestroyed() {
+func (s *Shm) MarkDestroyed(ctx context.Context) {
s.registry.dissociateKey(s)
s.mu.Lock()
@@ -664,7 +663,7 @@ func (s *Shm) MarkDestroyed() {
//
// N.B. This cannot be the final DecRef, as the caller also
// holds a reference.
- s.DecRef()
+ s.DecRef(ctx)
return
}
}
diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go
index 8243bb93e..b07e1c1bd 100644
--- a/pkg/sentry/kernel/signalfd/signalfd.go
+++ b/pkg/sentry/kernel/signalfd/signalfd.go
@@ -76,7 +76,7 @@ func New(ctx context.Context, mask linux.SignalSet) (*fs.File, error) {
}
// Release implements fs.FileOperations.Release.
-func (s *SignalOperations) Release() {}
+func (s *SignalOperations) Release(context.Context) {}
// Mask returns the signal mask.
func (s *SignalOperations) Mask() linux.SignalSet {
diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go
index 4607cde2f..a83ce219c 100644
--- a/pkg/sentry/kernel/syslog.go
+++ b/pkg/sentry/kernel/syslog.go
@@ -98,6 +98,15 @@ func (s *syslog) Log() []byte {
s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, selectMessage()))...)
}
+ if VFS2Enabled {
+ time += rand.Float64() / 2
+ s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Setting up VFS2..."))...)
+ if FUSEEnabled {
+ time += rand.Float64() / 2
+ s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Setting up FUSE..."))...)
+ }
+ }
+
time += rand.Float64() / 2
s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Ready!"))...)
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index f48247c94..5aee699e7 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -68,6 +68,21 @@ type Task struct {
// runState is exclusive to the task goroutine.
runState taskRunState
+ // taskWorkCount represents the current size of the task work queue. It is
+ // used to avoid acquiring taskWorkMu when the queue is empty.
+ //
+ // Must be accessed with atomic memory operations.
+ taskWorkCount int32
+
+ // taskWorkMu protects taskWork.
+ taskWorkMu sync.Mutex `state:"nosave"`
+
+ // taskWork is a queue of work to be executed before resuming user execution.
+ // It is similar to the task_work mechanism in Linux.
+ //
+ // taskWork is exclusive to the task goroutine.
+ taskWork []TaskWorker
+
// haveSyscallReturn is true if tc.Arch().Return() represents a value
// returned by a syscall (or set by ptrace after a syscall).
//
@@ -550,6 +565,10 @@ type Task struct {
// futexWaiter is exclusive to the task goroutine.
futexWaiter *futex.Waiter `state:"nosave"`
+ // robustList is a pointer to the head of the tasks's robust futex
+ // list.
+ robustList usermem.Addr
+
// startTime is the real time at which the task started. It is set when
// a Task is created or invokes execve(2).
//
@@ -711,17 +730,17 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock {
func (t *Task) IsChrooted() bool {
if VFS2Enabled {
realRoot := t.mountNamespaceVFS2.Root()
- defer realRoot.DecRef()
+ defer realRoot.DecRef(t)
root := t.fsContext.RootDirectoryVFS2()
- defer root.DecRef()
+ defer root.DecRef(t)
return root != realRoot
}
realRoot := t.tg.mounts.Root()
- defer realRoot.DecRef()
+ defer realRoot.DecRef(t)
root := t.fsContext.RootDirectory()
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(t)
}
return root != realRoot
}
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index e1ecca99e..9d7a9128f 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -161,6 +161,10 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
return 0, nil, syserror.EINVAL
}
+ // Pull task registers and FPU state; a cloned task will inherit the
+ // state of the current task.
+ t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch())
+
// "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a
// single clone(2) or unshare(2) call, the user namespace is guaranteed to
// be created first, giving the child (clone(2)) or caller (unshare(2))
@@ -237,7 +241,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
var fdTable *FDTable
if opts.NewFiles {
- fdTable = t.fdTable.Fork()
+ fdTable = t.fdTable.Fork(t)
} else {
fdTable = t.fdTable
fdTable.IncRef()
@@ -294,7 +298,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
nt, err := t.tg.pidns.owner.NewTask(cfg)
if err != nil {
if opts.NewThreadGroup {
- tg.release()
+ tg.release(t)
}
return 0, nil, err
}
@@ -510,7 +514,7 @@ func (t *Task) Unshare(opts *SharingOptions) error {
var oldFDTable *FDTable
if opts.NewFiles {
oldFDTable = t.fdTable
- t.fdTable = oldFDTable.Fork()
+ t.fdTable = oldFDTable.Fork(t)
}
var oldFSContext *FSContext
if opts.NewFSContext {
@@ -519,10 +523,10 @@ func (t *Task) Unshare(opts *SharingOptions) error {
}
t.mu.Unlock()
if oldFDTable != nil {
- oldFDTable.DecRef()
+ oldFDTable.DecRef(t)
}
if oldFSContext != nil {
- oldFSContext.DecRef()
+ oldFSContext.DecRef(t)
}
return nil
}
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index 00c425cca..5e4fb3e3a 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -198,11 +198,18 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
t.tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{})
t.tg.pidns.owner.mu.Unlock()
+ oldFDTable := t.fdTable
+ t.fdTable = t.fdTable.Fork(t)
+ oldFDTable.DecRef(t)
+
// Remove FDs with the CloseOnExec flag set.
- t.fdTable.RemoveIf(func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool {
+ t.fdTable.RemoveIf(t, func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool {
return flags.CloseOnExec
})
+ // Handle the robust futex list.
+ t.exitRobustList()
+
// NOTE(b/30815691): We currently do not implement privileged
// executables (set-user/group-ID bits and file capabilities). This
// allows us to unconditionally enable user dumpability on the new mm.
@@ -219,6 +226,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
t.tc = *r.tc
t.mu.Unlock()
t.unstopVforkParent()
+ t.p.FullStateChanged()
// NOTE(b/30316266): All locks must be dropped prior to calling Activate.
t.MemoryManager().Activate(t)
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index c4ade6e8e..c165d6cb1 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -253,6 +253,9 @@ func (*runExitMain) execute(t *Task) taskRunState {
}
}
+ // Handle the robust futex list.
+ t.exitRobustList()
+
// Deactivate the address space and update max RSS before releasing the
// task's MM.
t.Deactivate()
@@ -266,12 +269,12 @@ func (*runExitMain) execute(t *Task) taskRunState {
// Releasing the MM unblocks a blocked CLONE_VFORK parent.
t.unstopVforkParent()
- t.fsContext.DecRef()
- t.fdTable.DecRef()
+ t.fsContext.DecRef(t)
+ t.fdTable.DecRef(t)
t.mu.Lock()
if t.mountNamespaceVFS2 != nil {
- t.mountNamespaceVFS2.DecRef()
+ t.mountNamespaceVFS2.DecRef(t)
t.mountNamespaceVFS2 = nil
}
t.mu.Unlock()
@@ -279,7 +282,7 @@ func (*runExitMain) execute(t *Task) taskRunState {
// If this is the last task to exit from the thread group, release the
// thread group's resources.
if lastExiter {
- t.tg.release()
+ t.tg.release(t)
}
// Detach tracees.
diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go
index a53e77c9f..4b535c949 100644
--- a/pkg/sentry/kernel/task_futex.go
+++ b/pkg/sentry/kernel/task_futex.go
@@ -15,6 +15,7 @@
package kernel
import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -52,3 +53,127 @@ func (t *Task) LoadUint32(addr usermem.Addr) (uint32, error) {
func (t *Task) GetSharedKey(addr usermem.Addr) (futex.Key, error) {
return t.MemoryManager().GetSharedFutexKey(t, addr)
}
+
+// GetRobustList returns the address of the task's robust futex list.
+func (t *Task) GetRobustList() usermem.Addr {
+ t.mu.Lock()
+ addr := t.robustList
+ t.mu.Unlock()
+ return addr
+}
+
+// SetRobustList sets the robust futex list for the task.
+func (t *Task) SetRobustList(addr usermem.Addr) {
+ t.mu.Lock()
+ t.robustList = addr
+ t.mu.Unlock()
+}
+
+// exitRobustList walks the robust futex list, marking locks dead and notifying
+// wakers. It corresponds to Linux's exit_robust_list(). Following Linux,
+// errors are silently ignored.
+func (t *Task) exitRobustList() {
+ t.mu.Lock()
+ addr := t.robustList
+ t.robustList = 0
+ t.mu.Unlock()
+
+ if addr == 0 {
+ return
+ }
+
+ var rl linux.RobustListHead
+ if _, err := rl.CopyIn(t, usermem.Addr(addr)); err != nil {
+ return
+ }
+
+ next := rl.List
+ done := 0
+ var pendingLockAddr usermem.Addr
+ if rl.ListOpPending != 0 {
+ pendingLockAddr = usermem.Addr(rl.ListOpPending + rl.FutexOffset)
+ }
+
+ // Wake up normal elements.
+ for usermem.Addr(next) != addr {
+ // We traverse to the next element of the list before we
+ // actually wake anything. This prevents the race where waking
+ // this futex causes a modification of the list.
+ thisLockAddr := usermem.Addr(next + rl.FutexOffset)
+
+ // Try to decode the next element in the list before waking the
+ // current futex. But don't check the error until after we've
+ // woken the current futex. Linux does it in this order too.
+ _, nextErr := t.CopyIn(usermem.Addr(next), &next)
+
+ // Wake up the current futex if it's not pending.
+ if thisLockAddr != pendingLockAddr {
+ t.wakeRobustListOne(thisLockAddr)
+ }
+
+ // If there was an error copying the next futex, we must bail.
+ if nextErr != nil {
+ break
+ }
+
+ // This is a user structure, so it could be a massive list, or
+ // even contain a loop if they are trying to mess with us. We
+ // cap traversal to prevent that.
+ done++
+ if done >= linux.ROBUST_LIST_LIMIT {
+ break
+ }
+ }
+
+ // Is there a pending entry to wake?
+ if pendingLockAddr != 0 {
+ t.wakeRobustListOne(pendingLockAddr)
+ }
+}
+
+// wakeRobustListOne wakes a single futex from the robust list.
+func (t *Task) wakeRobustListOne(addr usermem.Addr) {
+ // Bit 0 in address signals PI futex.
+ pi := addr&1 == 1
+ addr = addr &^ 1
+
+ // Load the futex.
+ f, err := t.LoadUint32(addr)
+ if err != nil {
+ // Can't read this single value? Ignore the problem.
+ // We can wake the other futexes in the list.
+ return
+ }
+
+ tid := uint32(t.ThreadID())
+ for {
+ // Is this held by someone else?
+ if f&linux.FUTEX_TID_MASK != tid {
+ return
+ }
+
+ // This thread is dying and it's holding this futex. We need to
+ // set the owner died bit and wake up any waiters.
+ newF := (f & linux.FUTEX_WAITERS) | linux.FUTEX_OWNER_DIED
+ if curF, err := t.CompareAndSwapUint32(addr, f, newF); err != nil {
+ return
+ } else if curF != f {
+ // Futex changed out from under us. Try again...
+ f = curF
+ continue
+ }
+
+ // Wake waiters if there are any.
+ if f&linux.FUTEX_WAITERS != 0 {
+ private := f&linux.FUTEX_PRIVATE_FLAG != 0
+ if pi {
+ t.Futex().UnlockPI(t, addr, tid, private)
+ return
+ }
+ t.Futex().Wake(t, addr, private, linux.FUTEX_BITSET_MATCH_ANY, 1)
+ }
+
+ // Done.
+ return
+ }
+}
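For context, the accessors above are what a set_robust_list(2) handler reduces to. A hedged sketch (the actual syscall wiring lives outside this hunk, and linux.SizeOfRobustListHead is assumed to be defined in abi/linux):

    // SetRobustList implements linux syscall set_robust_list(2) -- a minimal
    // sketch; validation in the real handler may differ.
    func SetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    	head := args[0].Pointer()
    	length := args[1].SizeT()
    	// The kernel rejects robust list heads of unexpected size.
    	if length != uint(linux.SizeOfRobustListHead) {
    		return 0, nil, syserror.EINVAL
    	}
    	t.SetRobustList(head)
    	return 0, nil, nil
    }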
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
index eeccaa197..d23cea802 100644
--- a/pkg/sentry/kernel/task_log.go
+++ b/pkg/sentry/kernel/task_log.go
@@ -27,6 +27,9 @@ const (
// maxStackDebugBytes is the maximum number of user stack bytes that may be
// printed by debugDumpStack.
maxStackDebugBytes = 1024
+ // maxCodeDebugBytes is the maximum number of user code bytes that may be
+ // printed by debugDumpCode.
+ maxCodeDebugBytes = 128
)
// Infof logs a formatted info message by calling log.Infof.
@@ -61,6 +64,7 @@ func (t *Task) IsLogging(level log.Level) bool {
func (t *Task) DebugDumpState() {
t.debugDumpRegisters()
t.debugDumpStack()
+ t.debugDumpCode()
if mm := t.MemoryManager(); mm != nil {
t.Debugf("Mappings:\n%s", mm)
}
@@ -128,6 +132,45 @@ func (t *Task) debugDumpStack() {
}
}
+// debugDumpCode logs user code contents at log level debug.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) debugDumpCode() {
+ if !t.IsLogging(log.Debug) {
+ return
+ }
+ m := t.MemoryManager()
+ if m == nil {
+ t.Debugf("Memory manager for task is gone, skipping application code dump.")
+ return
+ }
+ t.Debugf("Code:")
+ // Print code on both sides of the instruction register.
+ start := usermem.Addr(t.Arch().IP()) - maxCodeDebugBytes/2
+ // Round addr down to a 16-byte boundary.
+ start &= ^usermem.Addr(15)
+ // Print 16 bytes per line, one byte at a time.
+ for offset := uint64(0); offset < maxCodeDebugBytes; offset += 16 {
+ addr, ok := start.AddLength(offset)
+ if !ok {
+ break
+ }
+ var data [16]byte
+ n, err := m.CopyIn(t, addr, data[:], usermem.IOOpts{
+ IgnorePermissions: true,
+ })
+ // Print as much of the line as we can, even if an error was
+ // encountered.
+ if n > 0 {
+ t.Debugf("%x: % x", addr, data[:n])
+ }
+ if err != nil {
+ t.Debugf("Error reading stack at address %x: %v", addr+usermem.Addr(n), err)
+ break
+ }
+ }
+}
+
// trace definitions.
//
// Note that all region names are prefixed by ':' in order to ensure that they
@@ -203,6 +246,6 @@ func (t *Task) traceExecEvent(tc *TaskContext) {
trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>")
return
}
- defer file.DecRef()
+ defer file.DecRef(t)
trace.Logf(t.traceContext, traceCategory, "exec: %s", file.PathnameWithDeleted(t))
}
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index d654dd997..abaf29216 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -167,7 +167,22 @@ func (app *runApp) execute(t *Task) taskRunState {
return (*runInterrupt)(nil)
}
- // We're about to switch to the application again. If there's still a
+ // Execute any task work callbacks before returning to user space.
+ if atomic.LoadInt32(&t.taskWorkCount) > 0 {
+ t.taskWorkMu.Lock()
+ queue := t.taskWork
+ t.taskWork = nil
+ atomic.StoreInt32(&t.taskWorkCount, 0)
+ t.taskWorkMu.Unlock()
+
+ // Do not hold taskWorkMu while executing task work, which may register
+ // more work.
+ for _, work := range queue {
+ work.TaskWork(t)
+ }
+ }
+
+ // We're about to switch to the application again. If there's still an
// unhandled SyscallRestartErrno that wasn't translated to an EINTR,
// restart the syscall that was interrupted. If there's a saved signal
// mask, restore it. (Note that restoring the saved signal mask may unblock
@@ -245,7 +260,7 @@ func (app *runApp) execute(t *Task) taskRunState {
region := trace.StartRegion(t.traceContext, runRegion)
t.accountTaskGoroutineEnter(TaskGoroutineRunningApp)
- info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU)
+ info, at, err := t.p.Switch(t, t.MemoryManager(), t.Arch(), t.rseqCPU)
t.accountTaskGoroutineLeave(TaskGoroutineRunningApp)
region.End()
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index 79766cafe..cff2a8365 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -255,10 +255,11 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct)
}
}
+ mm := t.MemoryManager()
// Set up the signal handler. If we have a saved signal mask, the signal
// handler should run with the current mask, but sigreturn should restore
// the saved one.
- st := &arch.Stack{t.Arch(), t.MemoryManager(), sp}
+ st := &arch.Stack{t.Arch(), mm, sp}
mask := t.signalMask
if t.haveSavedSignalMask {
mask = t.savedSignalMask
@@ -273,12 +274,13 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct)
// Please see the linux code as reference:
// linux/arch/arm64/kernel/signal.c:setup_return()
if act.Flags&linux.SA_RESTORER == 0 {
- act.Restorer = t.MemoryManager().VDSOSigReturn()
+ act.Restorer = mm.VDSOSigReturn()
}
if err := t.Arch().SignalSetup(st, &act, info, &alt, mask); err != nil {
return err
}
+ t.p.FullStateChanged()
t.haveSavedSignalMask = false
// Add our signal mask.
@@ -310,6 +312,7 @@ func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) {
// Restore our signal mask. SIGKILL and SIGSTOP should not be blocked.
t.SetSignalMask(sigset &^ UnblockableSignals)
+ t.p.FullStateChanged()
return ctrlResume, nil
}
@@ -636,6 +639,7 @@ func (t *Task) SetSavedSignalMask(mask linux.SignalSet) {
// SignalStack returns the task-private signal stack.
func (t *Task) SignalStack() arch.SignalStack {
+ t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch())
alt := t.signalStack
if t.onSignalStack(alt) {
alt.Flags |= arch.SignalStackFlagOnStack
@@ -1050,6 +1054,8 @@ func (*runInterrupt) execute(t *Task) taskRunState {
// Are there signals pending?
if info := t.dequeueSignalLocked(t.signalMask); info != nil {
+ t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch())
+
if linux.SignalSetOf(linux.Signal(info.Signo))&StopSignals != 0 {
// Indicate that we've dequeued a stop signal before unlocking the
// signal mutex; initiateGroupStop will check for races with
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index 8485fb4b6..64c1e120a 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -102,10 +102,10 @@ func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) {
t, err := ts.newTask(cfg)
if err != nil {
cfg.TaskContext.release()
- cfg.FSContext.DecRef()
- cfg.FDTable.DecRef()
+ cfg.FSContext.DecRef(t)
+ cfg.FDTable.DecRef(t)
if cfg.MountNamespaceVFS2 != nil {
- cfg.MountNamespaceVFS2.DecRef()
+ cfg.MountNamespaceVFS2.DecRef(t)
}
return nil, err
}
diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go
index 10c6e455c..296735d32 100644
--- a/pkg/sentry/kernel/task_stop.go
+++ b/pkg/sentry/kernel/task_stop.go
@@ -205,6 +205,22 @@ func (ts *TaskSet) BeginExternalStop() {
}
}
+// PullFullState pulls the full architecture state for all tasks.
+func (ts *TaskSet) PullFullState() {
+ ts.mu.Lock()
+ defer ts.mu.Unlock()
+ if ts.Root == nil {
+ return
+ }
+ for t := range ts.Root.tids {
+ t.Activate()
+ if mm := t.MemoryManager(); mm != nil {
+ t.p.PullFullState(mm.AddressSpace(), t.Arch())
+ }
+ t.Deactivate()
+ }
+}
+
// EndExternalStop indicates the end of an external stop started by a previous
// call to TaskSet.BeginExternalStop. EndExternalStop does not wait for task
// goroutines to resume.
diff --git a/pkg/sentry/kernel/task_work.go b/pkg/sentry/kernel/task_work.go
new file mode 100644
index 000000000..dda5a433a
--- /dev/null
+++ b/pkg/sentry/kernel/task_work.go
@@ -0,0 +1,38 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import "sync/atomic"
+
+// TaskWorker is a deferred task.
+//
+// This must be savable.
+type TaskWorker interface {
+ // TaskWork will be executed prior to returning to user space. Note that
+ // TaskWork may call RegisterWork again, but this will not be executed until
+ // the next return to user space, unlike in Linux. This effectively allows
+ // registration of indefinite user return hooks, but not by default.
+ TaskWork(t *Task)
+}
+
+// RegisterWork can be used to register additional task work that will be
+// performed prior to returning to user space. See TaskWorker.TaskWork for
+// semantics regarding registration.
+func (t *Task) RegisterWork(work TaskWorker) {
+ t.taskWorkMu.Lock()
+ defer t.taskWorkMu.Unlock()
+ atomic.AddInt32(&t.taskWorkCount, 1)
+ t.taskWork = append(t.taskWork, work)
+}
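To make the queue concrete, a sketch of a trivial TaskWorker (logWorker and its message are illustrative, not part of this change):

    // logWorker is a hypothetical TaskWorker that logs once before the task
    // next returns to user space.
    type logWorker struct {
    	msg string
    }

    // TaskWork implements TaskWorker.TaskWork.
    func (w *logWorker) TaskWork(t *Task) {
    	t.Infof("task work: %s", w.msg)
    }

Registration is then a one-liner from anywhere holding a Task reference: t.RegisterWork(&logWorker{msg: "about to return to user space"}). The work runs on the task goroutine in runApp.execute, before the switch back to the application.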
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 52849f5b3..0b34c0099 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -308,7 +308,7 @@ func (tg *ThreadGroup) Limits() *limits.LimitSet {
}
// release releases the thread group's resources.
-func (tg *ThreadGroup) release() {
+func (tg *ThreadGroup) release(t *Task) {
// Timers must be destroyed without holding the TaskSet or signal mutexes
// since timers send signals with Timer.mu locked.
tg.itimerRealTimer.Destroy()
@@ -325,7 +325,7 @@ func (tg *ThreadGroup) release() {
it.DestroyTimer()
}
if tg.mounts != nil {
- tg.mounts.DecRef()
+ tg.mounts.DecRef(t)
}
}
@@ -366,7 +366,8 @@ func (tg *ThreadGroup) SetControllingTTY(tty *TTY, arg int32) error {
// terminal is stolen, and all processes that had it as controlling
// terminal lose it." - tty_ioctl(4)
if tty.tg != nil && tg.processGroup.session != tty.tg.processGroup.session {
- if !auth.CredentialsFromContext(tg.leader).HasCapability(linux.CAP_SYS_ADMIN) || arg != 1 {
+ // Stealing requires CAP_SYS_ADMIN in the root user namespace.
+ if creds := auth.CredentialsFromContext(tg.leader); !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) || arg != 1 {
return syserror.EPERM
}
// Steal the TTY away. Unlike TIOCNOTTY, don't send signals.
diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go
index bf2dabb6e..872e1a82d 100644
--- a/pkg/sentry/kernel/threads.go
+++ b/pkg/sentry/kernel/threads.go
@@ -87,6 +87,13 @@ type TaskSet struct {
// at time of save (but note that this is not necessarily the same thing as
// sync.WaitGroup's zero value).
runningGoroutines sync.WaitGroup `state:"nosave"`
+
+ // aioGoroutines tracks the number of goroutines running async I/O
+ // callbacks.
+ //
+ // aioGoroutines is not saved but is required to be zero at the time of
+ // save.
+ aioGoroutines sync.WaitGroup `state:"nosave"`
}
// newTaskSet returns a new, empty TaskSet.
diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD
index 7ba7dc50c..2817aa3ba 100644
--- a/pkg/sentry/kernel/time/BUILD
+++ b/pkg/sentry/kernel/time/BUILD
@@ -6,6 +6,7 @@ go_library(
name = "time",
srcs = [
"context.go",
+ "tcpip.go",
"time.go",
],
visibility = ["//pkg/sentry:internal"],
diff --git a/pkg/sentry/kernel/time/tcpip.go b/pkg/sentry/kernel/time/tcpip.go
new file mode 100644
index 000000000..c4474c0cf
--- /dev/null
+++ b/pkg/sentry/kernel/time/tcpip.go
@@ -0,0 +1,131 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package time
+
+import (
+ "sync"
+ "time"
+)
+
+// TcpipAfterFunc waits for duration to elapse according to clock, then runs fn.
+// The timer is started immediately and will fire exactly once.
+func TcpipAfterFunc(clock Clock, duration time.Duration, fn func()) *TcpipTimer {
+ timer := &TcpipTimer{
+ clock: clock,
+ }
+ timer.notifier = functionNotifier{
+ fn: func() {
+ // tcpip.Timer.Stop() explicitly states that the function is called in a
+ // separate goroutine that Stop() does not synchronize with.
+ // Timer.Destroy() synchronizes with calls to TimerListener.Notify().
+ // This is semantically meaningful because, in the former case, it's
+ // legal to call tcpip.Timer.Stop() while holding locks that may also be
+ // taken by the function, but this isn't so in the latter case. Most
+ // immediately, Timer calls TimerListener.Notify() while holding
+ // Timer.mu. A deadlock occurs without spawning a goroutine:
+ // T1: (Timer expires)
+ // => Timer.Tick() <- Timer.mu.Lock() called
+ // => TimerListener.Notify()
+ // => Timer.Stop()
+ // => Timer.Destroy() <- Timer.mu.Lock() called, deadlock!
+ //
+ // Spawning a goroutine avoids the deadlock:
+ // T1: (Timer expires)
+ // => Timer.Tick() <- Timer.mu.Lock() called
+ // => TimerListener.Notify() <- Launches T2
+ // T2:
+ // => Timer.Stop()
+ // => Timer.Destroy() <- Timer.mu.Lock() called, blocks
+ // T1:
+ // => (returns) <- Timer.mu.Unlock() called
+ // T2:
+ // => (continues) <- No deadlock!
+ go func() {
+ timer.Stop()
+ fn()
+ }()
+ },
+ }
+ timer.Reset(duration)
+ return timer
+}
+
+// TcpipTimer is a resettable timer with variable duration expirations.
+// Implements tcpip.Timer, which does not define a Destroy method; instead, all
+// resources are released after timer expiration and calls to Timer.Stop.
+//
+// Must be created by TcpipAfterFunc.
+type TcpipTimer struct {
+ // clock is the time source. clock is immutable.
+ clock Clock
+
+ // notifier is called when the Timer expires. notifier is immutable.
+ notifier functionNotifier
+
+ // mu protects t.
+ mu sync.Mutex
+
+ // t stores the latest running Timer. This is replaced whenever Reset is
+ // called since Timer cannot be restarted once it has been Destroyed by Stop.
+ //
+ // This field is nil iff Stop has been called.
+ t *Timer
+}
+
+// Stop implements tcpip.Timer.Stop.
+func (r *TcpipTimer) Stop() bool {
+ r.mu.Lock()
+ defer r.mu.Unlock()
+
+ if r.t == nil {
+ return false
+ }
+ _, lastSetting := r.t.Swap(Setting{})
+ r.t.Destroy()
+ r.t = nil
+ return lastSetting.Enabled
+}
+
+// Reset implements tcpip.Timer.Reset.
+func (r *TcpipTimer) Reset(d time.Duration) {
+ r.mu.Lock()
+ defer r.mu.Unlock()
+
+ if r.t == nil {
+ r.t = NewTimer(r.clock, &r.notifier)
+ }
+
+ r.t.Swap(Setting{
+ Enabled: true,
+ Period: 0,
+ Next: r.clock.Now().Add(d),
+ })
+}
+
+// functionNotifier is a TimerListener that runs a function.
+//
+// functionNotifier cannot be saved or loaded.
+type functionNotifier struct {
+ fn func()
+}
+
+// Notify implements ktime.TimerListener.Notify.
+func (f *functionNotifier) Notify(uint64, Setting) (Setting, bool) {
+ f.fn()
+ return Setting{}, false
+}
+
+// Destroy implements ktime.TimerListener.Destroy.
+func (f *functionNotifier) Destroy() {}
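A usage sketch (clock and the callback body are illustrative; ktime is the conventional import alias for this package):

    // Fire once after 100ms. The callback runs in its own goroutine, and the
    // timer releases its resources after firing or after Stop.
    timer := ktime.TcpipAfterFunc(clock, 100*time.Millisecond, func() {
    	// e.g. retransmit a segment or expire a connection.
    })

    // Stop reports whether the timer was still pending; Reset re-arms it.
    if timer.Stop() {
    	// The callback will not run; re-arm with a longer duration.
    	timer.Reset(200 * time.Millisecond)
    }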
diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go
index da0ea7bb5..7c4fefb16 100644
--- a/pkg/sentry/kernel/timekeeper.go
+++ b/pkg/sentry/kernel/timekeeper.go
@@ -21,8 +21,8 @@ import (
"gvisor.dev/gvisor/pkg/log"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
- "gvisor.dev/gvisor/pkg/sentry/platform"
sentrytime "gvisor.dev/gvisor/pkg/sentry/time"
"gvisor.dev/gvisor/pkg/sync"
)
@@ -90,7 +90,7 @@ type Timekeeper struct {
// NewTimekeeper does not take ownership of paramPage.
//
// SetClocks must be called on the returned Timekeeper before it is usable.
-func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage platform.FileRange) (*Timekeeper, error) {
+func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage memmap.FileRange) (*Timekeeper, error) {
return &Timekeeper{
params: NewVDSOParamPage(mfp, paramPage),
}, nil
@@ -186,6 +186,7 @@ func (t *Timekeeper) startUpdater() {
timer := time.NewTicker(sentrytime.ApproxUpdateInterval)
t.wg.Add(1)
go func() { // S/R-SAFE: stopped during save.
+ defer t.wg.Done()
for {
// Start with an update immediately, so the clocks are
// ready ASAP.
@@ -209,9 +210,6 @@ func (t *Timekeeper) startUpdater() {
p.realtimeBaseRef = int64(realtimeParams.BaseRef)
p.realtimeFrequency = realtimeParams.Frequency
}
-
- log.Debugf("Updating VDSO parameters: %+v", p)
-
return p
}); err != nil {
log.Warningf("Unable to update VDSO parameter page: %v", err)
@@ -220,7 +218,6 @@ func (t *Timekeeper) startUpdater() {
select {
case <-timer.C:
case <-t.stop:
- t.wg.Done()
return
}
}
diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go
index f1b3c212c..290c32466 100644
--- a/pkg/sentry/kernel/vdso.go
+++ b/pkg/sentry/kernel/vdso.go
@@ -19,8 +19,8 @@ import (
"gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -58,7 +58,7 @@ type vdsoParams struct {
type VDSOParamPage struct {
// The parameter page is fr, allocated from mfp.MemoryFile().
mfp pgalloc.MemoryFileProvider
- fr platform.FileRange
+ fr memmap.FileRange
// seq is the current sequence count written to the page.
//
@@ -81,7 +81,7 @@ type VDSOParamPage struct {
// * VDSOParamPage must be the only writer to fr.
//
// * mfp.MemoryFile().MapInternal(fr) must return a single safemem.Block.
-func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *VDSOParamPage {
+func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *VDSOParamPage {
return &VDSOParamPage{mfp: mfp, fr: fr}
}
diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD
index c6aa65f28..34bdb0b69 100644
--- a/pkg/sentry/loader/BUILD
+++ b/pkg/sentry/loader/BUILD
@@ -30,9 +30,6 @@ go_library(
"//pkg/rand",
"//pkg/safemem",
"//pkg/sentry/arch",
- "//pkg/sentry/fs",
- "//pkg/sentry/fs/anon",
- "//pkg/sentry/fs/fsutil",
"//pkg/sentry/fsbridge",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/limits",
@@ -45,6 +42,5 @@ go_library(
"//pkg/syserr",
"//pkg/syserror",
"//pkg/usermem",
- "//pkg/waiter",
],
)
diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index 616fafa2c..20dd1cc21 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -90,14 +90,23 @@ type elfInfo struct {
sharedObject bool
}
+// fullReader extracts the ReadFull method from fsbridge.File so that
+// client code does not need to define an entire fsbridge.File when only read
+// functionality is needed.
+//
+// TODO(gvisor.dev/issue/1035): Once VFS2 ships, rewrite this to wrap
+// vfs.FileDescription's PRead/Read instead.
+type fullReader interface {
+ // ReadFull is the same as fsbridge.File.ReadFull.
+ ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error)
+}
+
// parseHeader parses the ELF header, verifying that this is a supported ELF
// file and returning the ELF program headers.
//
// This is similar to elf.NewFile, except that it is more strict about what it
// accepts from the ELF, and it doesn't parse unnecessary parts of the file.
-//
-// ctx may be nil if f does not need it.
-func parseHeader(ctx context.Context, f fsbridge.File) (elfInfo, error) {
+func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) {
// Check ident first; it will tell us the endianness of the rest of the
// structs.
var ident [elf.EI_NIDENT]byte
@@ -272,7 +281,7 @@ func mapSegment(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, phdr
}
defer func() {
if mopts.MappingIdentity != nil {
- mopts.MappingIdentity.DecRef()
+ mopts.MappingIdentity.DecRef(ctx)
}
}()
if err := f.ConfigureMMap(ctx, &mopts); err != nil {
@@ -654,7 +663,7 @@ func loadELF(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, error
ctx.Infof("Error opening interpreter %s: %v", bin.interpreter, err)
return loadedELF{}, nil, err
}
- defer intFile.DecRef()
+ defer intFile.DecRef(ctx)
interp, err = loadInterpreterELF(ctx, args.MemoryManager, intFile, bin)
if err != nil {
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index 88449fe95..8d6802ea3 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -27,7 +27,6 @@ import (
"gvisor.dev/gvisor/pkg/cpuid"
"gvisor.dev/gvisor/pkg/rand"
"gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fsbridge"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/mm"
@@ -80,22 +79,6 @@ type LoadArgs struct {
Features *cpuid.FeatureSet
}
-// readFull behaves like io.ReadFull for an *fs.File.
-func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
- var total int64
- for dst.NumBytes() > 0 {
- n, err := f.Preadv(ctx, dst, offset+total)
- total += n
- if err == io.EOF && total != 0 {
- return total, io.ErrUnexpectedEOF
- } else if err != nil {
- return total, err
- }
- dst = dst.DropFirst64(n)
- }
- return total, nil
-}
-
// openPath opens args.Filename and checks that it is valid for loading.
//
// openPath returns an *fs.Dirent and *fs.File for args.Filename, which is not
@@ -171,7 +154,7 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context
return loadedELF{}, nil, nil, nil, err
}
// Ensure file is released in case the code loops or errors out.
- defer args.File.DecRef()
+ defer args.File.DecRef(ctx)
} else {
if err := checkIsRegularFile(ctx, args.File, args.Filename); err != nil {
return loadedELF{}, nil, nil, nil, err
@@ -238,14 +221,14 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V
// Load the executable itself.
loaded, ac, file, newArgv, err := loadExecutable(ctx, args)
if err != nil {
- return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux())
+ return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux())
}
- defer file.DecRef()
+ defer file.DecRef(ctx)
// Load the VDSO.
vdsoAddr, err := loadVDSO(ctx, args.MemoryManager, vdso, loaded)
if err != nil {
- return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Error loading VDSO: %v", err), syserr.FromError(err).ToLinux())
+ return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("error loading VDSO: %v", err), syserr.FromError(err).ToLinux())
}
// Setup the heap. brk starts at the next page after the end of the
@@ -309,7 +292,7 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V
m.SetEnvvStart(sl.EnvvStart)
m.SetEnvvEnd(sl.EnvvEnd)
m.SetAuxv(auxv)
- m.SetExecutable(file)
+ m.SetExecutable(ctx, file)
symbolValue, err := getSymbolValueFromVDSO("rt_sigreturn")
if err != nil {
diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go
index 00977fc08..05a294fe6 100644
--- a/pkg/sentry/loader/vdso.go
+++ b/pkg/sentry/loader/vdso.go
@@ -26,10 +26,6 @@ import (
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/fs"
- "gvisor.dev/gvisor/pkg/sentry/fs/anon"
- "gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
- "gvisor.dev/gvisor/pkg/sentry/fsbridge"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/mm"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
@@ -37,7 +33,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
- "gvisor.dev/gvisor/pkg/waiter"
)
const vdsoPrelink = 0xffffffffff700000
@@ -55,52 +50,11 @@ func (f *fileContext) Value(key interface{}) interface{} {
}
}
-// byteReader implements fs.FileOperations for reading from a []byte source.
-type byteReader struct {
- fsutil.FileNoFsync `state:"nosave"`
- fsutil.FileNoIoctl `state:"nosave"`
- fsutil.FileNoMMap `state:"nosave"`
- fsutil.FileNoSplice `state:"nosave"`
- fsutil.FileNoopFlush `state:"nosave"`
- fsutil.FileNoopRelease `state:"nosave"`
- fsutil.FileNotDirReaddir `state:"nosave"`
- fsutil.FilePipeSeek `state:"nosave"`
- fsutil.FileUseInodeUnstableAttr `state:"nosave"`
- waiter.AlwaysReady `state:"nosave"`
-
+type byteFullReader struct {
data []byte
}
-var _ fs.FileOperations = (*byteReader)(nil)
-
-// newByteReaderFile creates a fake file to read data from.
-//
-// TODO(gvisor.dev/issue/1623): Convert to VFS2.
-func newByteReaderFile(ctx context.Context, data []byte) *fs.File {
- // Create a fake inode.
- inode := fs.NewInode(
- ctx,
- &fsutil.SimpleFileInode{},
- fs.NewPseudoMountSource(ctx),
- fs.StableAttr{
- Type: fs.Anonymous,
- DeviceID: anon.PseudoDevice.DeviceID(),
- InodeID: anon.PseudoDevice.NextIno(),
- BlockSize: usermem.PageSize,
- })
-
- // Use the fake inode to create a fake dirent.
- dirent := fs.NewTransientDirent(inode)
- defer dirent.DecRef()
-
- // Use the fake dirent to make a fake file.
- flags := fs.FileFlags{Read: true, Pread: true}
- return fs.NewFile(&fileContext{Context: context.Background()}, dirent, flags, &byteReader{
- data: data,
- })
-}
-
-func (b *byteReader) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+func (b *byteFullReader) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) {
if offset < 0 {
return 0, syserror.EINVAL
}
@@ -111,10 +65,6 @@ func (b *byteReader) Read(ctx context.Context, file *fs.File, dst usermem.IOSequ
return int64(n), err
}
-func (b *byteReader) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
- panic("Write not supported")
-}
-
// validateVDSO checks that the VDSO can be loaded by loadVDSO.
//
// VDSOs are special (see below). Since we are going to map the VDSO directly
@@ -130,7 +80,7 @@ func (b *byteReader) Write(ctx context.Context, file *fs.File, src usermem.IOSeq
// * PT_LOAD segments don't extend beyond the end of the file.
//
// ctx may be nil if f does not need it.
-func validateVDSO(ctx context.Context, f fsbridge.File, size uint64) (elfInfo, error) {
+func validateVDSO(ctx context.Context, f fullReader, size uint64) (elfInfo, error) {
info, err := parseHeader(ctx, f)
if err != nil {
log.Infof("Unable to parse VDSO header: %v", err)
@@ -248,13 +198,12 @@ func getSymbolValueFromVDSO(symbol string) (uint64, error) {
// PrepareVDSO validates the system VDSO and returns a VDSO, containing the
// param page for updating by the kernel.
-func PrepareVDSO(ctx context.Context, mfp pgalloc.MemoryFileProvider) (*VDSO, error) {
- vdsoFile := fsbridge.NewFSFile(newByteReaderFile(ctx, vdsoBin))
+func PrepareVDSO(mfp pgalloc.MemoryFileProvider) (*VDSO, error) {
+ vdsoFile := &byteFullReader{data: vdsoBin}
// First make sure the VDSO is valid. vdsoFile does not use ctx, so a
// nil context can be passed.
info, err := validateVDSO(nil, vdsoFile, uint64(len(vdsoBin)))
- vdsoFile.DecRef()
if err != nil {
return nil, err
}
diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD
index a98b66de1..2c95669cd 100644
--- a/pkg/sentry/memmap/BUILD
+++ b/pkg/sentry/memmap/BUILD
@@ -28,9 +28,21 @@ go_template_instance(
},
)
+go_template_instance(
+ name = "file_range",
+ out = "file_range.go",
+ package = "memmap",
+ prefix = "File",
+ template = "//pkg/segment:generic_range",
+ types = {
+ "T": "uint64",
+ },
+)
+
go_library(
name = "memmap",
srcs = [
+ "file_range.go",
"mappable_range.go",
"mapping_set.go",
"mapping_set_impl.go",
@@ -40,7 +52,7 @@ go_library(
deps = [
"//pkg/context",
"//pkg/log",
- "//pkg/sentry/platform",
+ "//pkg/safemem",
"//pkg/syserror",
"//pkg/usermem",
],
diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go
index c6db9fc8f..65d83096f 100644
--- a/pkg/sentry/memmap/memmap.go
+++ b/pkg/sentry/memmap/memmap.go
@@ -19,12 +19,12 @@ import (
"fmt"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/usermem"
)
// Mappable represents a memory-mappable object, a mutable mapping from uint64
-// offsets to (platform.File, uint64 File offset) pairs.
+// offsets to (File, uint64 File offset) pairs.
//
// See mm/mm.go for Mappable's place in the lock order.
//
@@ -74,7 +74,7 @@ type Mappable interface {
// Translations are valid until invalidated by a callback to
// MappingSpace.Invalidate or until the caller removes its mapping of the
// translated range. Mappable implementations must ensure that at least one
- // reference is held on all pages in a platform.File that may be the result
+ // reference is held on all pages in a File that may be the result
// of a valid Translation.
//
// Preconditions: required.Length() > 0. optional.IsSupersetOf(required).
@@ -100,7 +100,7 @@ type Translation struct {
Source MappableRange
// File is the mapped file.
- File platform.File
+ File File
// Offset is the offset into File at which this Translation begins.
Offset uint64
@@ -110,9 +110,9 @@ type Translation struct {
Perms usermem.AccessType
}
-// FileRange returns the platform.FileRange represented by t.
-func (t Translation) FileRange() platform.FileRange {
- return platform.FileRange{t.Offset, t.Offset + t.Source.Length()}
+// FileRange returns the FileRange represented by t.
+func (t Translation) FileRange() FileRange {
+ return FileRange{t.Offset, t.Offset + t.Source.Length()}
}
// CheckTranslateResult returns an error if (ts, terr) does not satisfy all
@@ -238,7 +238,7 @@ type MappingIdentity interface {
IncRef()
// DecRef decrements the MappingIdentity's reference count.
- DecRef()
+ DecRef(ctx context.Context)
// MappedName returns the application-visible name shown in
// /proc/[pid]/maps.
@@ -360,4 +360,57 @@ type MMapOpts struct {
//
// TODO(jamieliu): Replace entirely with MappingIdentity?
Hint string
+
+ // Force means to skip validation checks of Addr and Length. It can be
+ // used to create special mappings outside the layout range
+ // [mm.layout.MinAddr, mm.layout.MaxAddr]. Use with caution.
+ //
+ // If Force is true, Unmap and Fixed must be true.
+ Force bool
+}
+
+// File represents a host file that may be mapped into a platform.AddressSpace.
+type File interface {
+ // All pages in a File are reference-counted.
+
+ // IncRef increments the reference count on all pages in fr.
+ //
+ // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() >
+ // 0. At least one reference must be held on all pages in fr. (The File
+ // interface does not provide a way to acquire an initial reference;
+ // implementors may define mechanisms for doing so.)
+ IncRef(fr FileRange)
+
+ // DecRef decrements the reference count on all pages in fr.
+ //
+ // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() >
+ // 0. At least one reference must be held on all pages in fr.
+ DecRef(fr FileRange)
+
+ // MapInternal returns a mapping of the given file offsets in the invoking
+ // process' address space for reading and writing.
+ //
+ // Note that fr.Start and fr.End need not be page-aligned.
+ //
+ // Preconditions: fr.Length() > 0. At least one reference must be held on
+ // all pages in fr.
+ //
+ // Postconditions: The returned mapping is valid as long as at least one
+ // reference is held on the mapped pages.
+ MapInternal(fr FileRange, at usermem.AccessType) (safemem.BlockSeq, error)
+
+ // FD returns the file descriptor represented by the File.
+ //
+ // The only permitted operation on the returned file descriptor is to map
+ // pages from it consistent with the requirements of AddressSpace.MapFile.
+ FD() int
+}
+
+// FileRange represents a range of uint64 offsets into a File.
+//
+// type FileRange <generated using go_generics>
+
+// String implements fmt.Stringer.String.
+func (fr FileRange) String() string {
+ return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End)
}
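To ground the new interface: pgalloc.MemoryFile is the canonical memmap.File implementor, and a Mappable's Translate hands out (File, offset) pairs into it. A simplified sketch modeled on mm.SpecialMappable, assuming a hypothetical mappable backed by a single allocated range m.fr:

    // Translate implements memmap.Mappable.Translate -- a simplified sketch.
    func (m *example) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
    	source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()})
    	if source.Length() == 0 {
    		return nil, &memmap.BusError{syserror.EFAULT}
    	}
    	return []memmap.Translation{
    		{
    			Source: source,
    			File:   m.mfp.MemoryFile(), // *pgalloc.MemoryFile implements memmap.File.
    			Offset: m.fr.Start + source.Start,
    			Perms:  usermem.AnyAccess,
    		},
    	}, nil
    }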
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD
index a036ce53c..f9d0837a1 100644
--- a/pkg/sentry/mm/BUILD
+++ b/pkg/sentry/mm/BUILD
@@ -7,14 +7,14 @@ go_template_instance(
name = "file_refcount_set",
out = "file_refcount_set.go",
imports = {
- "platform": "gvisor.dev/gvisor/pkg/sentry/platform",
+ "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap",
},
package = "mm",
prefix = "fileRefcount",
template = "//pkg/segment:generic_set",
types = {
"Key": "uint64",
- "Range": "platform.FileRange",
+ "Range": "memmap.FileRange",
"Value": "int32",
"Functions": "fileRefcountSetFunctions",
},
diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go
index 379148903..16fea53c4 100644
--- a/pkg/sentry/mm/aio_context.go
+++ b/pkg/sentry/mm/aio_context.go
@@ -20,7 +20,6 @@ import (
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
@@ -243,7 +242,7 @@ type aioMappable struct {
refs.AtomicRefCount
mfp pgalloc.MemoryFileProvider
- fr platform.FileRange
+ fr memmap.FileRange
}
var aioRingBufferSize = uint64(usermem.Addr(linux.AIORingSize).MustRoundUp())
@@ -259,8 +258,8 @@ func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) {
}
// DecRef implements refs.RefCounter.DecRef.
-func (m *aioMappable) DecRef() {
- m.AtomicRefCount.DecRefWithDestructor(func() {
+func (m *aioMappable) DecRef(ctx context.Context) {
+ m.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) {
m.mfp.MemoryFile().DecRef(m.fr)
})
}
@@ -368,7 +367,7 @@ func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint
if err != nil {
return 0, err
}
- defer m.DecRef()
+ defer m.DecRef(ctx)
addr, err := mm.MMap(ctx, memmap.MMapOpts{
Length: aioRingBufferSize,
MappingIdentity: m,
diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go
index aac56679b..09dbc06a4 100644
--- a/pkg/sentry/mm/lifecycle.go
+++ b/pkg/sentry/mm/lifecycle.go
@@ -57,6 +57,8 @@ func (mm *MemoryManager) SetMmapLayout(ac arch.Context, r *limits.LimitSet) (arc
// Fork creates a copy of mm with 1 user, as for Linux syscalls fork() or
// clone() (without CLONE_VM).
func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
+ mm.AddressSpace().PreFork()
+ defer mm.AddressSpace().PostFork()
mm.metadataMu.Lock()
defer mm.metadataMu.Unlock()
mm.mappingMu.RLock()
@@ -258,7 +260,7 @@ func (mm *MemoryManager) DecUsers(ctx context.Context) {
mm.executable = nil
mm.metadataMu.Unlock()
if exe != nil {
- exe.DecRef()
+ exe.DecRef(ctx)
}
mm.activeMu.Lock()
diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go
index 28e5057f7..0cfd60f6c 100644
--- a/pkg/sentry/mm/metadata.go
+++ b/pkg/sentry/mm/metadata.go
@@ -15,6 +15,7 @@
package mm
import (
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fsbridge"
"gvisor.dev/gvisor/pkg/usermem"
@@ -147,7 +148,7 @@ func (mm *MemoryManager) Executable() fsbridge.File {
// SetExecutable sets the executable.
//
// This takes a reference on d.
-func (mm *MemoryManager) SetExecutable(file fsbridge.File) {
+func (mm *MemoryManager) SetExecutable(ctx context.Context, file fsbridge.File) {
mm.metadataMu.Lock()
// Grab a new reference.
@@ -164,7 +165,7 @@ func (mm *MemoryManager) SetExecutable(file fsbridge.File) {
// Do this without holding the lock, since it may wind up doing some
// I/O to sync the dirent, etc.
if orig != nil {
- orig.DecRef()
+ orig.DecRef(ctx)
}
}
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index 6db7c3d40..3e85964e4 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -25,7 +25,7 @@
// Locks taken by memmap.Mappable.Translate
// mm.privateRefs.mu
// platform.AddressSpace locks
-// platform.File locks
+// memmap.File locks
// mm.aioManager.mu
// mm.AIOContext.mu
//
@@ -396,7 +396,7 @@ type pma struct {
// file is the file mapped by this pma. Only pmas for which file ==
// MemoryManager.mfp.MemoryFile() may be saved. pmas hold a reference to
// the corresponding file range while they exist.
- file platform.File `state:"nosave"`
+ file memmap.File `state:"nosave"`
// off is the offset into file at which this pma begins.
//
@@ -436,7 +436,7 @@ type pma struct {
private bool
// If internalMappings is not empty, it is the cached return value of
- // file.MapInternal for the platform.FileRange mapped by this pma.
+ // file.MapInternal for the memmap.FileRange mapped by this pma.
internalMappings safemem.BlockSeq `state:"nosave"`
}
@@ -469,10 +469,10 @@ func (fileRefcountSetFunctions) MaxKey() uint64 {
func (fileRefcountSetFunctions) ClearValue(_ *int32) {
}
-func (fileRefcountSetFunctions) Merge(_ platform.FileRange, rc1 int32, _ platform.FileRange, rc2 int32) (int32, bool) {
+func (fileRefcountSetFunctions) Merge(_ memmap.FileRange, rc1 int32, _ memmap.FileRange, rc2 int32) (int32, bool) {
return rc1, rc1 == rc2
}
-func (fileRefcountSetFunctions) Split(_ platform.FileRange, rc int32, _ uint64) (int32, int32) {
+func (fileRefcountSetFunctions) Split(_ memmap.FileRange, rc int32, _ uint64) (int32, int32) {
return rc, rc
}
diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go
index 62e4c20af..930ec895f 100644
--- a/pkg/sentry/mm/pma.go
+++ b/pkg/sentry/mm/pma.go
@@ -21,7 +21,6 @@ import (
"gvisor.dev/gvisor/pkg/safecopy"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -604,7 +603,7 @@ func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivat
}
}
-// Pin returns the platform.File ranges currently mapped by addresses in ar in
+// Pin returns the memmap.File ranges currently mapped by addresses in ar in
// mm, acquiring a reference on the returned ranges which the caller must
// release by calling Unpin. If not all addresses are mapped, Pin returns a
// non-nil error. Note that Pin may return both a non-empty slice of
@@ -674,15 +673,15 @@ type PinnedRange struct {
Source usermem.AddrRange
// File is the mapped file.
- File platform.File
+ File memmap.File
// Offset is the offset into File at which this PinnedRange begins.
Offset uint64
}
-// FileRange returns the platform.File offsets mapped by pr.
-func (pr PinnedRange) FileRange() platform.FileRange {
- return platform.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())}
+// FileRange returns the memmap.File offsets mapped by pr.
+func (pr PinnedRange) FileRange() memmap.FileRange {
+ return memmap.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())}
}
// Unpin releases the reference held by prs.
@@ -857,7 +856,7 @@ func (mm *MemoryManager) vecInternalMappingsLocked(ars usermem.AddrRangeSeq) saf
}
// incPrivateRef acquires a reference on private pages in fr.
-func (mm *MemoryManager) incPrivateRef(fr platform.FileRange) {
+func (mm *MemoryManager) incPrivateRef(fr memmap.FileRange) {
mm.privateRefs.mu.Lock()
defer mm.privateRefs.mu.Unlock()
refSet := &mm.privateRefs.refs
@@ -878,8 +877,8 @@ func (mm *MemoryManager) incPrivateRef(fr platform.FileRange) {
}
// decPrivateRef releases a reference on private pages in fr.
-func (mm *MemoryManager) decPrivateRef(fr platform.FileRange) {
- var freed []platform.FileRange
+func (mm *MemoryManager) decPrivateRef(fr memmap.FileRange) {
+ var freed []memmap.FileRange
mm.privateRefs.mu.Lock()
refSet := &mm.privateRefs.refs
@@ -951,7 +950,7 @@ func (pmaSetFunctions) Merge(ar1 usermem.AddrRange, pma1 pma, ar2 usermem.AddrRa
// Discard internal mappings instead of trying to merge them, since merging
// them requires an allocation and getting them again from the
- // platform.File might not.
+ // memmap.File might not.
pma1.internalMappings = safemem.BlockSeq{}
return pma1, true
}
@@ -1012,12 +1011,12 @@ func (pseg pmaIterator) getInternalMappingsLocked() error {
return nil
}
-func (pseg pmaIterator) fileRange() platform.FileRange {
+func (pseg pmaIterator) fileRange() memmap.FileRange {
return pseg.fileRangeOf(pseg.Range())
}
// Preconditions: pseg.Range().IsSupersetOf(ar). ar.Length != 0.
-func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) platform.FileRange {
+func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) memmap.FileRange {
if checkInvariants {
if !pseg.Ok() {
panic("terminal pma iterator")
@@ -1032,5 +1031,5 @@ func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) platform.FileRange {
pma := pseg.ValuePtr()
pstart := pseg.Start()
- return platform.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)}
+ return memmap.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)}
}
diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go
index 9ad52082d..4cdb52eb6 100644
--- a/pkg/sentry/mm/special_mappable.go
+++ b/pkg/sentry/mm/special_mappable.go
@@ -19,7 +19,6 @@ import (
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
- "gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -35,7 +34,7 @@ type SpecialMappable struct {
refs.AtomicRefCount
mfp pgalloc.MemoryFileProvider
- fr platform.FileRange
+ fr memmap.FileRange
name string
}
@@ -44,15 +43,15 @@ type SpecialMappable struct {
// SpecialMappable will use the given name in /proc/[pid]/maps.
//
// Preconditions: fr.Length() != 0.
-func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *SpecialMappable {
+func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *SpecialMappable {
m := SpecialMappable{mfp: mfp, fr: fr, name: name}
m.EnableLeakCheck("mm.SpecialMappable")
return &m
}
// DecRef implements refs.RefCounter.DecRef.
-func (m *SpecialMappable) DecRef() {
- m.AtomicRefCount.DecRefWithDestructor(func() {
+func (m *SpecialMappable) DecRef(ctx context.Context) {
+ m.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) {
m.mfp.MemoryFile().DecRef(m.fr)
})
}
@@ -126,7 +125,7 @@ func (m *SpecialMappable) MemoryFileProvider() pgalloc.MemoryFileProvider {
// FileRange returns the offsets into MemoryFileProvider().MemoryFile() that
// store the SpecialMappable's contents.
-func (m *SpecialMappable) FileRange() platform.FileRange {
+func (m *SpecialMappable) FileRange() memmap.FileRange {
return m.fr
}
diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go
index 3f496aa9f..e74d4e1c1 100644
--- a/pkg/sentry/mm/syscalls.go
+++ b/pkg/sentry/mm/syscalls.go
@@ -101,7 +101,7 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme
if err != nil {
return 0, err
}
- defer m.DecRef()
+ defer m.DecRef(ctx)
opts.MappingIdentity = m
opts.Mappable = m
}
@@ -1191,7 +1191,7 @@ func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length ui
mr := vseg.mappableRangeOf(vseg.Range().Intersect(ar))
mm.mappingMu.RUnlock()
err := id.Msync(ctx, mr)
- id.DecRef()
+ id.DecRef(ctx)
if err != nil {
return err
}
diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go
index 16d8207e9..c4e1989ed 100644
--- a/pkg/sentry/mm/vma.go
+++ b/pkg/sentry/mm/vma.go
@@ -42,7 +42,12 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp
Map32Bit: opts.Map32Bit,
})
if err != nil {
- return vmaIterator{}, usermem.AddrRange{}, err
+ // Can't force without opts.Unmap and opts.Fixed.
+ if opts.Force && opts.Unmap && opts.Fixed {
+ addr = opts.Addr
+ } else {
+ return vmaIterator{}, usermem.AddrRange{}, err
+ }
}
ar, _ := addr.ToRange(opts.Length)
@@ -377,7 +382,7 @@ func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRa
vma.mappable.RemoveMapping(ctx, mm, vmaAR, vma.off, vma.canWriteMappableLocked())
}
if vma.id != nil {
- vma.id.DecRef()
+ vma.id.DecRef(ctx)
}
mm.usageAS -= uint64(vmaAR.Length())
if vma.isPrivateDataLocked() {
@@ -446,7 +451,7 @@ func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRa
}
if vma2.id != nil {
- vma2.id.DecRef()
+ vma2.id.DecRef(context.Background())
}
return vma1, true
}
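A hedged sketch of a Force caller (specialAddr and the permission values are illustrative; per the MMapOpts comment, Fixed and Unmap must also be set):

    addr, err := mm.MMap(ctx, memmap.MMapOpts{
    	Length:   usermem.PageSize,
    	Addr:     specialAddr, // may lie outside the usual layout range
    	Fixed:    true,
    	Unmap:    true,
    	Force:    true, // skip Addr/Length validation, as wired up above
    	Private:  true,
    	Perms:    usermem.ReadWrite,
    	MaxPerms: usermem.AnyAccess,
    })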
diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD
index 1eeb9f317..7a3311a70 100644
--- a/pkg/sentry/pgalloc/BUILD
+++ b/pkg/sentry/pgalloc/BUILD
@@ -33,21 +33,42 @@ go_template_instance(
out = "usage_set.go",
consts = {
"minDegree": "10",
+ "trackGaps": "1",
},
imports = {
- "platform": "gvisor.dev/gvisor/pkg/sentry/platform",
+ "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap",
},
package = "pgalloc",
prefix = "usage",
template = "//pkg/segment:generic_set",
types = {
"Key": "uint64",
- "Range": "platform.FileRange",
+ "Range": "memmap.FileRange",
"Value": "usageInfo",
"Functions": "usageSetFunctions",
},
)
+go_template_instance(
+ name = "reclaim_set",
+ out = "reclaim_set.go",
+ consts = {
+ "minDegree": "10",
+ },
+ imports = {
+ "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap",
+ },
+ package = "pgalloc",
+ prefix = "reclaim",
+ template = "//pkg/segment:generic_set",
+ types = {
+ "Key": "uint64",
+ "Range": "memmap.FileRange",
+ "Value": "reclaimSetValue",
+ "Functions": "reclaimSetFunctions",
+ },
+)
+
go_library(
name = "pgalloc",
srcs = [
@@ -56,6 +77,7 @@ go_library(
"evictable_range_set.go",
"pgalloc.go",
"pgalloc_unsafe.go",
+ "reclaim_set.go",
"save_restore.go",
"usage_set.go",
],
@@ -67,9 +89,10 @@ go_library(
"//pkg/safemem",
"//pkg/sentry/arch",
"//pkg/sentry/hostmm",
- "//pkg/sentry/platform",
+ "//pkg/sentry/memmap",
"//pkg/sentry/usage",
"//pkg/state",
+ "//pkg/state/wire",
"//pkg/sync",
"//pkg/syserror",
"//pkg/usermem",
diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go
index 2b11ea4ae..46d3be58c 100644
--- a/pkg/sentry/pgalloc/pgalloc.go
+++ b/pkg/sentry/pgalloc/pgalloc.go
@@ -33,14 +33,14 @@ import (
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/hostmm"
- "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
-// MemoryFile is a platform.File whose pages may be allocated to arbitrary
+// MemoryFile is a memmap.File whose pages may be allocated to arbitrary
// users.
type MemoryFile struct {
// opts holds options passed to NewMemoryFile. opts is immutable.
@@ -108,12 +108,6 @@ type MemoryFile struct {
usageSwapped uint64
usageLast time.Time
- // minUnallocatedPage is the minimum page that may be unallocated.
- // i.e., there are no unallocated pages below minUnallocatedPage.
- //
- // minUnallocatedPage is protected by mu.
- minUnallocatedPage uint64
-
// fileSize is the size of the backing memory file in bytes. fileSize is
// always a power-of-two multiple of chunkSize.
//
@@ -146,11 +140,9 @@ type MemoryFile struct {
// is protected by mu.
reclaimable bool
- // minReclaimablePage is the minimum page that may be reclaimable.
- // i.e., all reclaimable pages are >= minReclaimablePage.
- //
- // minReclaimablePage is protected by mu.
- minReclaimablePage uint64
+ // reclaim is the collection of regions for reclaim. reclaim is protected
+ // by mu.
+ reclaim reclaimSet
// reclaimCond is signaled (with mu locked) when reclaimable or destroyed
// transitions from false to true.
@@ -273,12 +265,10 @@ type evictableMemoryUserInfo struct {
}
const (
- chunkShift = 24
- chunkSize = 1 << chunkShift // 16 MB
+ chunkShift = 30
+ chunkSize = 1 << chunkShift // 1 GB
chunkMask = chunkSize - 1
- initialSize = chunkSize
-
// maxPage is the highest 64-bit page.
maxPage = math.MaxUint64 &^ (usermem.PageSize - 1)
)
@@ -302,19 +292,12 @@ func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) {
if err := file.Truncate(0); err != nil {
return nil, err
}
- if err := file.Truncate(initialSize); err != nil {
- return nil, err
- }
f := &MemoryFile{
- opts: opts,
- fileSize: initialSize,
- file: file,
- // No pages are reclaimable. DecRef will always be able to
- // decrease minReclaimablePage from this point.
- minReclaimablePage: maxPage,
- evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo),
+ opts: opts,
+ file: file,
+ evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo),
}
- f.mappings.Store(make([]uintptr, initialSize/chunkSize))
+ f.mappings.Store(make([]uintptr, 0))
f.reclaimCond.L = &f.mu
if f.opts.DelayedEviction == DelayedEvictionEnabled && f.opts.UseHostMemcgPressure {
@@ -389,7 +372,7 @@ func (f *MemoryFile) Destroy() {
// to Allocate.
//
// Preconditions: length must be page-aligned and non-zero.
-func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.FileRange, error) {
+func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (memmap.FileRange, error) {
if length == 0 || length%usermem.PageSize != 0 {
panic(fmt.Sprintf("invalid allocation length: %#x", length))
}
@@ -404,48 +387,38 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi
alignment = usermem.HugePageSize
}
- start, minUnallocatedPage := findUnallocatedRange(&f.usage, f.minUnallocatedPage, length, alignment)
- end := start + length
- // File offsets are int64s. Since length must be strictly positive, end
- // cannot legitimately be 0.
- if end < start || int64(end) <= 0 {
- return platform.FileRange{}, syserror.ENOMEM
+ // Find a range in the underlying file.
+ fr, ok := findAvailableRange(&f.usage, f.fileSize, length, alignment)
+ if !ok {
+ return memmap.FileRange{}, syserror.ENOMEM
}
- // Expand the file if needed. Double the file size on each expansion;
- // uncommitted pages have effectively no cost.
- fileSize := f.fileSize
- for int64(end) > fileSize {
- if fileSize >= 2*fileSize {
- // fileSize overflow.
- return platform.FileRange{}, syserror.ENOMEM
+ // Expand the file if needed.
+ if int64(fr.End) > f.fileSize {
+ // Round the new file size up to be chunk-aligned.
+ newFileSize := (int64(fr.End) + chunkMask) &^ chunkMask
+ if err := f.file.Truncate(newFileSize); err != nil {
+ return memmap.FileRange{}, err
}
- fileSize *= 2
- }
- if fileSize > f.fileSize {
- if err := f.file.Truncate(fileSize); err != nil {
- return platform.FileRange{}, err
- }
- f.fileSize = fileSize
+ f.fileSize = newFileSize
f.mappingsMu.Lock()
oldMappings := f.mappings.Load().([]uintptr)
- newMappings := make([]uintptr, fileSize>>chunkShift)
+ newMappings := make([]uintptr, newFileSize>>chunkShift)
copy(newMappings, oldMappings)
f.mappings.Store(newMappings)
f.mappingsMu.Unlock()
}
- // Mark selected pages as in use.
- fr := platform.FileRange{start, end}
if f.opts.ManualZeroing {
if err := f.forEachMappingSlice(fr, func(bs []byte) {
for i := range bs {
bs[i] = 0
}
}); err != nil {
- return platform.FileRange{}, err
+ return memmap.FileRange{}, err
}
}
+ // Mark selected pages as in use.
if !f.usage.Add(fr, usageInfo{
kind: kind,
refs: 1,
@@ -453,49 +426,79 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi
panic(fmt.Sprintf("allocating %v: failed to insert into usage set:\n%v", fr, &f.usage))
}
- if minUnallocatedPage < start {
- f.minUnallocatedPage = minUnallocatedPage
- } else {
- // start was the first unallocated page. The next must be
- // somewhere beyond end.
- f.minUnallocatedPage = end
- }
-
return fr, nil
}
-// findUnallocatedRange returns the first unallocated page in usage of the
-// specified length and alignment beginning at page start and the first single
-// unallocated page.
-func findUnallocatedRange(usage *usageSet, start, length, alignment uint64) (uint64, uint64) {
- // Only searched until the first page is found.
- firstPage := start
- foundFirstPage := false
- alignMask := alignment - 1
- for seg := usage.LowerBoundSegment(start); seg.Ok(); seg = seg.NextSegment() {
- r := seg.Range()
-
- if !foundFirstPage && r.Start > firstPage {
- foundFirstPage = true
+// findAvailableRange returns an available range in the usageSet.
+//
+// Note that scanning for available slots takes place from the end of the
+// file backwards, then forwards. This heuristic has important consequences
+// for how sequential mappings can be merged in the host VMAs, given that
+// addresses for both application and sentry mappings are allocated top-down
+// (from higher to lower addresses). The file is also grown exponentially in
+// order to create space for mappings to be allocated downwards.
+//
+// Precondition: alignment must be a power of 2.
+func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint64) (memmap.FileRange, bool) {
+ alignmentMask := alignment - 1
+
+ // Search for space in existing gaps, starting at the current end of the
+ // file and working backward.
+ lastGap := usage.LastGap()
+ gap := lastGap
+ for {
+ end := gap.End()
+ if end > uint64(fileSize) {
+ end = uint64(fileSize)
}
- if start >= r.End {
- // start was rounded up to an alignment boundary from the end
- // of a previous segment and is now beyond r.End.
- continue
+ // Try to allocate from the end of this gap, with the start of the
+ // allocated range aligned down to alignment.
+ unalignedStart := end - length
+ if unalignedStart > end {
+ // Negative overflow: this and all preceding gaps are too small to
+ // accommodate length.
+ break
+ }
+ if start := unalignedStart &^ alignmentMask; start >= gap.Start() {
+ return memmap.FileRange{start, start + length}, true
}
- // This segment represents allocated or reclaimable pages; only the
- // range from start to the segment's beginning is allocatable, and the
- // next allocatable range begins after the segment.
- if r.Start > start && r.Start-start >= length {
+
+ gap = gap.PrevLargeEnoughGap(length)
+ if !gap.Ok() {
break
}
- start = (r.End + alignMask) &^ alignMask
- if !foundFirstPage {
- firstPage = r.End
+ }
+
+ // Check that it's possible to fit this allocation at the end of a file of any size.
+ min := lastGap.Start()
+ min = (min + alignmentMask) &^ alignmentMask
+ if min+length < min {
+ // Overflow: allocation would exceed the range of uint64.
+ return memmap.FileRange{}, false
+ }
+
+ // Determine the minimum file size required to fit this allocation at its end.
+ for {
+ newFileSize := 2 * fileSize
+ if newFileSize <= fileSize {
+ if fileSize != 0 {
+ // Overflow: allocation would exceed the range of int64.
+ return memmap.FileRange{}, false
+ }
+ newFileSize = chunkSize
+ }
+ fileSize = newFileSize
+
+ unalignedStart := uint64(fileSize) - length
+ if unalignedStart > uint64(fileSize) {
+ // Negative overflow: fileSize is still inadequate.
+ continue
+ }
+ if start := unalignedStart &^ alignmentMask; start >= min {
+ return memmap.FileRange{start, start + length}, true
}
}
- return start, firstPage
}
// AllocateAndFill allocates memory of the given kind and fills it by calling
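
The overflow checks above lean on uint64 wraparound: end - length wraps to a value greater than end exactly when length > end, and min + length wraps below min exactly when the sum exceeds the uint64 range. A self-contained illustration with arbitrary values:

    package main

    import "fmt"

    func main() {
    	const pageSize = 4096

    	// Align-down of a candidate start: clear the low bits.
    	end := uint64(10 * pageSize)
    	length := uint64(3 * pageSize)
    	alignment := uint64(2 * pageSize)
    	unalignedStart := end - length
    	start := unalignedStart &^ (alignment - 1)
    	fmt.Printf("start=%#x (aligned down from %#x)\n", start, unalignedStart)

    	// Underflow detection: if length exceeds end, end-length wraps to a
    	// huge value, so "unalignedStart > end" flags the impossible fit.
    	end = uint64(pageSize)
    	length = uint64(2 * pageSize)
    	unalignedStart = end - length
    	fmt.Println("underflow detected:", unalignedStart > end) // true

    	// Overflow detection: min+length wrapping below min means the
    	// allocation would not fit in the uint64 offset space.
    	min := uint64(3) << 62
    	length = uint64(1) << 63
    	fmt.Println("overflow detected:", min+length < min) // true
    }
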
@@ -505,22 +508,22 @@ func findUnallocatedRange(usage *usageSet, start, length, alignment uint64) (uin
// by r.ReadToBlocks(), it returns that error.
//
// Preconditions: length > 0. length must be page-aligned.
-func (f *MemoryFile) AllocateAndFill(length uint64, kind usage.MemoryKind, r safemem.Reader) (platform.FileRange, error) {
+func (f *MemoryFile) AllocateAndFill(length uint64, kind usage.MemoryKind, r safemem.Reader) (memmap.FileRange, error) {
fr, err := f.Allocate(length, kind)
if err != nil {
- return platform.FileRange{}, err
+ return memmap.FileRange{}, err
}
dsts, err := f.MapInternal(fr, usermem.Write)
if err != nil {
f.DecRef(fr)
- return platform.FileRange{}, err
+ return memmap.FileRange{}, err
}
n, err := safemem.ReadFullToBlocks(r, dsts)
un := uint64(usermem.Addr(n).RoundDown())
if un < length {
// Free unused memory and update fr to contain only the memory that is
// still allocated.
- f.DecRef(platform.FileRange{fr.Start + un, fr.End})
+ f.DecRef(memmap.FileRange{fr.Start + un, fr.End})
fr.End = fr.Start + un
}
return fr, err
@@ -537,7 +540,7 @@ const (
// will read zeroes.
//
// Preconditions: fr.Length() > 0.
-func (f *MemoryFile) Decommit(fr platform.FileRange) error {
+func (f *MemoryFile) Decommit(fr memmap.FileRange) error {
if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 {
panic(fmt.Sprintf("invalid range: %v", fr))
}
@@ -557,7 +560,7 @@ func (f *MemoryFile) Decommit(fr platform.FileRange) error {
return nil
}
-func (f *MemoryFile) markDecommitted(fr platform.FileRange) {
+func (f *MemoryFile) markDecommitted(fr memmap.FileRange) {
f.mu.Lock()
defer f.mu.Unlock()
// Since we're changing the knownCommitted attribute, we need to merge
@@ -578,8 +581,8 @@ func (f *MemoryFile) markDecommitted(fr platform.FileRange) {
f.usage.MergeRange(fr)
}
-// IncRef implements platform.File.IncRef.
-func (f *MemoryFile) IncRef(fr platform.FileRange) {
+// IncRef implements memmap.File.IncRef.
+func (f *MemoryFile) IncRef(fr memmap.FileRange) {
if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 {
panic(fmt.Sprintf("invalid range: %v", fr))
}
@@ -597,8 +600,8 @@ func (f *MemoryFile) IncRef(fr platform.FileRange) {
f.usage.MergeAdjacent(fr)
}
-// DecRef implements platform.File.DecRef.
-func (f *MemoryFile) DecRef(fr platform.FileRange) {
+// DecRef implements memmap.File.DecRef.
+func (f *MemoryFile) DecRef(fr memmap.FileRange) {
if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 {
panic(fmt.Sprintf("invalid range: %v", fr))
}
@@ -616,6 +619,7 @@ func (f *MemoryFile) DecRef(fr platform.FileRange) {
}
val.refs--
if val.refs == 0 {
+ f.reclaim.Add(seg.Range(), reclaimSetValue{})
freed = true
// Reclassify memory as System, until it's freed by the reclaim
// goroutine.
@@ -628,17 +632,13 @@ func (f *MemoryFile) DecRef(fr platform.FileRange) {
f.usage.MergeAdjacent(fr)
if freed {
- if fr.Start < f.minReclaimablePage {
- // We've freed at least one lower page.
- f.minReclaimablePage = fr.Start
- }
f.reclaimable = true
f.reclaimCond.Signal()
}
}
-// MapInternal implements platform.File.MapInternal.
-func (f *MemoryFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+// MapInternal implements memmap.File.MapInternal.
+func (f *MemoryFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
if !fr.WellFormed() || fr.Length() == 0 {
panic(fmt.Sprintf("invalid range: %v", fr))
}
@@ -664,7 +664,7 @@ func (f *MemoryFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (
// forEachMappingSlice invokes fn on a sequence of byte slices that
// collectively map all bytes in fr.
-func (f *MemoryFile) forEachMappingSlice(fr platform.FileRange, fn func([]byte)) error {
+func (f *MemoryFile) forEachMappingSlice(fr memmap.FileRange, fn func([]byte)) error {
mappings := f.mappings.Load().([]uintptr)
for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize {
chunk := int(chunkStart >> chunkShift)
@@ -944,7 +944,7 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(
continue
case !populated && populatedRun:
// Finish the run by changing this segment.
- runRange := platform.FileRange{
+ runRange := memmap.FileRange{
Start: r.Start + uint64(populatedRunStart*usermem.PageSize),
End: r.Start + uint64(i*usermem.PageSize),
}
@@ -1009,7 +1009,7 @@ func (f *MemoryFile) File() *os.File {
return f.file
}
-// FD implements platform.File.FD.
+// FD implements memmap.File.FD.
func (f *MemoryFile) FD() int {
return int(f.file.Fd())
}
@@ -1030,6 +1030,7 @@ func (f *MemoryFile) String() string {
// for allocation.
func (f *MemoryFile) runReclaim() {
for {
+ // N.B. We must call f.markReclaimed on the returned FileRange.
fr, ok := f.findReclaimable()
if !ok {
break
@@ -1085,13 +1086,17 @@ func (f *MemoryFile) runReclaim() {
}
}
-func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) {
+// findReclaimable finds memory that has been marked for reclaim.
+//
+// Note that the returned range will be removed from tracking. It
+// must be reclaimed (removed from f.usage) at this point.
+func (f *MemoryFile) findReclaimable() (memmap.FileRange, bool) {
f.mu.Lock()
defer f.mu.Unlock()
for {
for {
if f.destroyed {
- return platform.FileRange{}, false
+ return memmap.FileRange{}, false
}
if f.reclaimable {
break
@@ -1103,27 +1108,24 @@ func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) {
}
f.reclaimCond.Wait()
}
- // Allocate returns the first usable range in offset order and is
- // currently a linear scan, so reclaiming from the beginning of the
- // file minimizes the expected latency of Allocate.
- for seg := f.usage.LowerBoundSegment(f.minReclaimablePage); seg.Ok(); seg = seg.NextSegment() {
- if seg.ValuePtr().refs == 0 {
- f.minReclaimablePage = seg.End()
- return seg.Range(), true
- }
+ // Allocate works from the back of the file inwards, so reclaim
+ // preserves this order to minimize the cost of the search.
+ if seg := f.reclaim.LastSegment(); seg.Ok() {
+ fr := seg.Range()
+ f.reclaim.Remove(seg)
+ return fr, true
}
- // No pages are reclaimable.
+ // Nothing is reclaimable.
f.reclaimable = false
- f.minReclaimablePage = maxPage
}
}
-func (f *MemoryFile) markReclaimed(fr platform.FileRange) {
+func (f *MemoryFile) markReclaimed(fr memmap.FileRange) {
f.mu.Lock()
defer f.mu.Unlock()
seg := f.usage.FindSegment(fr.Start)
- // All of fr should be mapped to a single uncommitted reclaimable segment
- // accounted to System.
+ // All of fr should be mapped to a single uncommitted reclaimable
+ // segment accounted to System.
if !seg.Ok() {
panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage))
}
@@ -1137,14 +1139,10 @@ func (f *MemoryFile) markReclaimed(fr platform.FileRange) {
}); got != want {
panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage))
}
- // Deallocate reclaimed pages. Even though all of seg is reclaimable, the
- // caller of markReclaimed may not have decommitted it, so we can only mark
- // fr as reclaimed.
+ // Deallocate reclaimed pages. Even though all of seg is reclaimable,
+ // the caller of markReclaimed may not have decommitted it, so we can
+ // only mark fr as reclaimed.
f.usage.Remove(f.usage.Isolate(seg, fr))
- if fr.Start < f.minUnallocatedPage {
- // We've deallocated at least one lower page.
- f.minUnallocatedPage = fr.Start
- }
}
// StartEvictions requests that f evict all evictable allocations. It does not
@@ -1224,11 +1222,11 @@ func (usageSetFunctions) MaxKey() uint64 {
func (usageSetFunctions) ClearValue(val *usageInfo) {
}
-func (usageSetFunctions) Merge(_ platform.FileRange, val1 usageInfo, _ platform.FileRange, val2 usageInfo) (usageInfo, bool) {
+func (usageSetFunctions) Merge(_ memmap.FileRange, val1 usageInfo, _ memmap.FileRange, val2 usageInfo) (usageInfo, bool) {
return val1, val1 == val2
}
-func (usageSetFunctions) Split(_ platform.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) {
+func (usageSetFunctions) Split(_ memmap.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) {
return val, val
}
@@ -1255,3 +1253,27 @@ func (evictableRangeSetFunctions) Merge(_ EvictableRange, _ evictableRangeSetVal
func (evictableRangeSetFunctions) Split(_ EvictableRange, _ evictableRangeSetValue, _ uint64) (evictableRangeSetValue, evictableRangeSetValue) {
return evictableRangeSetValue{}, evictableRangeSetValue{}
}
+
+// reclaimSetValue is the value type of reclaimSet.
+type reclaimSetValue struct{}
+
+type reclaimSetFunctions struct{}
+
+func (reclaimSetFunctions) MinKey() uint64 {
+ return 0
+}
+
+func (reclaimSetFunctions) MaxKey() uint64 {
+ return math.MaxUint64
+}
+
+func (reclaimSetFunctions) ClearValue(val *reclaimSetValue) {
+}
+
+func (reclaimSetFunctions) Merge(_ memmap.FileRange, _ reclaimSetValue, _ memmap.FileRange, _ reclaimSetValue) (reclaimSetValue, bool) {
+ return reclaimSetValue{}, true
+}
+
+func (reclaimSetFunctions) Split(_ memmap.FileRange, _ reclaimSetValue, _ uint64) (reclaimSetValue, reclaimSetValue) {
+ return reclaimSetValue{}, reclaimSetValue{}
+}
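
Taken together, the reclaim path is now: DecRef adds zero-reference ranges to f.reclaim (adjacent ranges coalesce because Merge above always succeeds), and the reclaimer pops the highest range first to mirror the top-down allocator. A condensed sketch using only the set operations that appear in this diff; the helper names are hypothetical and f.mu is assumed held:

    // Producer (DecRef path): a range whose refcount hit zero becomes
    // reclaimable.
    func (f *MemoryFile) noteFreedLocked(fr memmap.FileRange) {
    	f.reclaim.Add(fr, reclaimSetValue{})
    	f.reclaimable = true
    	f.reclaimCond.Signal()
    }

    // Consumer (reclaimer): take the highest range first, since
    // allocation also works top-down.
    func (f *MemoryFile) popReclaimableLocked() (memmap.FileRange, bool) {
    	if seg := f.reclaim.LastSegment(); seg.Ok() {
    		fr := seg.Range()
    		f.reclaim.Remove(seg)
    		return fr, true
    	}
    	return memmap.FileRange{}, false
    }
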
diff --git a/pkg/sentry/pgalloc/pgalloc_test.go b/pkg/sentry/pgalloc/pgalloc_test.go
index 293f22c6b..405db141f 100644
--- a/pkg/sentry/pgalloc/pgalloc_test.go
+++ b/pkg/sentry/pgalloc/pgalloc_test.go
@@ -23,39 +23,49 @@ import (
const (
page = usermem.PageSize
hugepage = usermem.HugePageSize
+ topPage = (1 << 63) - page
)
func TestFindUnallocatedRange(t *testing.T) {
for _, test := range []struct {
- desc string
- usage *usageSegmentDataSlices
- start uint64
- length uint64
- alignment uint64
- unallocated uint64
- minUnallocated uint64
+ desc string
+ usage *usageSegmentDataSlices
+ fileSize int64
+ length uint64
+ alignment uint64
+ start uint64
+ expectFail bool
}{
{
- desc: "Initial allocation succeeds",
- usage: &usageSegmentDataSlices{},
- start: 0,
- length: page,
- alignment: page,
- unallocated: 0,
- minUnallocated: 0,
+ desc: "Initial allocation succeeds",
+ usage: &usageSegmentDataSlices{},
+ length: page,
+ alignment: page,
+ start: chunkSize - page, // Grows by chunkSize, allocate down.
},
{
- desc: "Allocation begins at start of file",
+ desc: "Allocation finds empty space at start of file",
usage: &usageSegmentDataSlices{
Start: []uint64{page},
End: []uint64{2 * page},
Values: []usageInfo{{refs: 1}},
},
- start: 0,
- length: page,
- alignment: page,
- unallocated: 0,
- minUnallocated: 0,
+ fileSize: 2 * page,
+ length: page,
+ alignment: page,
+ start: 0,
+ },
+ {
+ desc: "Allocation finds empty space at end of file",
+ usage: &usageSegmentDataSlices{
+ Start: []uint64{0},
+ End: []uint64{page},
+ Values: []usageInfo{{refs: 1}},
+ },
+ fileSize: 2 * page,
+ length: page,
+ alignment: page,
+ start: page,
},
{
desc: "In-use frames are not allocatable",
@@ -64,11 +74,10 @@ func TestFindUnallocatedRange(t *testing.T) {
End: []uint64{page, 2 * page},
Values: []usageInfo{{refs: 1}, {refs: 2}},
},
- start: 0,
- length: page,
- alignment: page,
- unallocated: 2 * page,
- minUnallocated: 2 * page,
+ fileSize: 2 * page,
+ length: page,
+ alignment: page,
+ start: 3 * page, // Double fileSize, allocate top-down.
},
{
desc: "Reclaimable frames are not allocatable",
@@ -77,11 +86,10 @@ func TestFindUnallocatedRange(t *testing.T) {
End: []uint64{page, 2 * page, 3 * page},
Values: []usageInfo{{refs: 1}, {refs: 0}, {refs: 1}},
},
- start: 0,
- length: page,
- alignment: page,
- unallocated: 3 * page,
- minUnallocated: 3 * page,
+ fileSize: 3 * page,
+ length: page,
+ alignment: page,
+ start: 5 * page, // Double fileSize, grow down.
},
{
desc: "Gaps between in-use frames are allocatable",
@@ -90,11 +98,10 @@ func TestFindUnallocatedRange(t *testing.T) {
End: []uint64{page, 3 * page},
Values: []usageInfo{{refs: 1}, {refs: 1}},
},
- start: 0,
- length: page,
- alignment: page,
- unallocated: page,
- minUnallocated: page,
+ fileSize: 3 * page,
+ length: page,
+ alignment: page,
+ start: page,
},
{
desc: "Inadequately-sized gaps are rejected",
@@ -103,14 +110,13 @@ func TestFindUnallocatedRange(t *testing.T) {
End: []uint64{page, 3 * page},
Values: []usageInfo{{refs: 1}, {refs: 1}},
},
- start: 0,
- length: 2 * page,
- alignment: page,
- unallocated: 3 * page,
- minUnallocated: page,
+ fileSize: 3 * page,
+ length: 2 * page,
+ alignment: page,
+ start: 4 * page, // Double fileSize, grow down.
},
{
- desc: "Hugepage alignment is honored",
+ desc: "Alignment is honored at end of file",
usage: &usageSegmentDataSlices{
Start: []uint64{0, hugepage + page},
// Hugepage-sized gap here that shouldn't be allocated from
@@ -118,37 +124,103 @@ func TestFindUnallocatedRange(t *testing.T) {
End: []uint64{page, hugepage + 2*page},
Values: []usageInfo{{refs: 1}, {refs: 1}},
},
- start: 0,
- length: hugepage,
- alignment: hugepage,
- unallocated: 2 * hugepage,
- minUnallocated: page,
+ fileSize: hugepage + 2*page,
+ length: hugepage,
+ alignment: hugepage,
+ start: 3 * hugepage, // Double fileSize until alignment is satisfied, grow down.
},
{
- desc: "Pages before start ignored",
+ desc: "Alignment is honored before end of file",
+ usage: &usageSegmentDataSlices{
+ Start: []uint64{0, 2*hugepage + page},
+ // Page will need to be shifted down from top.
+ End: []uint64{page, 2*hugepage + 2*page},
+ Values: []usageInfo{{refs: 1}, {refs: 1}},
+ },
+ fileSize: 2*hugepage + 2*page,
+ length: hugepage,
+ alignment: hugepage,
+ start: hugepage,
+ },
+ {
+ desc: "Allocation doubles file size more than once if necessary",
+ usage: &usageSegmentDataSlices{},
+ fileSize: page,
+ length: 4 * page,
+ alignment: page,
+ start: 0,
+ },
+ {
+ desc: "Allocations are compact if possible",
usage: &usageSegmentDataSlices{
Start: []uint64{page, 3 * page},
End: []uint64{2 * page, 4 * page},
Values: []usageInfo{{refs: 1}, {refs: 2}},
},
- start: page,
- length: page,
- alignment: page,
- unallocated: 2 * page,
- minUnallocated: 2 * page,
+ fileSize: 4 * page,
+ length: page,
+ alignment: page,
+ start: 2 * page,
+ },
+ {
+ desc: "Top-down allocation within one gap",
+ usage: &usageSegmentDataSlices{
+ Start: []uint64{page, 4 * page, 7 * page},
+ End: []uint64{2 * page, 5 * page, 8 * page},
+ Values: []usageInfo{{refs: 1}, {refs: 2}, {refs: 1}},
+ },
+ fileSize: 8 * page,
+ length: page,
+ alignment: page,
+ start: 6 * page,
+ },
+ {
+ desc: "Top-down allocation between multiple gaps",
+ usage: &usageSegmentDataSlices{
+ Start: []uint64{page, 3 * page, 5 * page},
+ End: []uint64{2 * page, 4 * page, 6 * page},
+ Values: []usageInfo{{refs: 1}, {refs: 2}, {refs: 1}},
+ },
+ fileSize: 6 * page,
+ length: page,
+ alignment: page,
+ start: 4 * page,
},
{
- desc: "start may be in the middle of segment",
+ desc: "Top-down allocation with large top gap",
usage: &usageSegmentDataSlices{
- Start: []uint64{0, 3 * page},
+ Start: []uint64{page, 3 * page},
End: []uint64{2 * page, 4 * page},
Values: []usageInfo{{refs: 1}, {refs: 2}},
},
- start: page,
- length: page,
- alignment: page,
- unallocated: 2 * page,
- minUnallocated: 2 * page,
+ fileSize: 8 * page,
+ length: page,
+ alignment: page,
+ start: 7 * page,
+ },
+ {
+ desc: "Gaps found with possible overflow",
+ usage: &usageSegmentDataSlices{
+ Start: []uint64{page, topPage - page},
+ End: []uint64{2 * page, topPage},
+ Values: []usageInfo{{refs: 1}, {refs: 1}},
+ },
+ fileSize: topPage,
+ length: page,
+ alignment: page,
+ start: topPage - 2*page,
+ },
+ {
+ desc: "Overflow detected",
+ usage: &usageSegmentDataSlices{
+ Start: []uint64{page},
+ End: []uint64{topPage},
+ Values: []usageInfo{{refs: 1}},
+ },
+ fileSize: topPage,
+ length: 2 * page,
+ alignment: page,
+ expectFail: true,
},
} {
t.Run(test.desc, func(t *testing.T) {
@@ -156,12 +228,18 @@ func TestFindUnallocatedRange(t *testing.T) {
if err := usage.ImportSortedSlices(test.usage); err != nil {
t.Fatalf("Failed to initialize usage from %v: %v", test.usage, err)
}
- unallocated, minUnallocated := findUnallocatedRange(&usage, test.start, test.length, test.alignment)
- if unallocated != test.unallocated {
- t.Errorf("findUnallocatedRange(%v, %x, %x, %x): got unallocated %x, wanted %x", test.usage, test.start, test.length, test.alignment, unallocated, test.unallocated)
+ fr, ok := findAvailableRange(&usage, test.fileSize, test.length, test.alignment)
+ if !test.expectFail && !ok {
+ t.Fatalf("findAvailableRange(%v, %x, %x, %x): got %x, false wanted %x, true", test.usage, test.fileSize, test.length, test.alignment, fr.Start, test.start)
+ }
+ if test.expectFail && ok {
+ t.Fatalf("findAvailableRange(%v, %x, %x, %x): got %x, true wanted %x, false", test.usage, test.fileSize, test.length, test.alignment, fr.Start, test.start)
+ }
+ if ok && fr.Start != test.start {
+ t.Errorf("findAvailableRange(%v, %x, %x, %x): got start=%x, wanted %x", test.usage, test.fileSize, test.length, test.alignment, fr.Start, test.start)
}
- if minUnallocated != test.minUnallocated {
- t.Errorf("findUnallocatedRange(%v, %x, %x, %x): got minUnallocated %x, wanted %x", test.usage, test.start, test.length, test.alignment, minUnallocated, test.minUnallocated)
+ if ok && fr.End != test.start+test.length {
+ t.Errorf("findAvailableRange(%v, %x, %x, %x): got end=%x, wanted %x", test.usage, test.fileSize, test.length, test.alignment, fr.End, test.start+test.length)
}
})
}
diff --git a/pkg/sentry/pgalloc/save_restore.go b/pkg/sentry/pgalloc/save_restore.go
index f8385c146..78317fa35 100644
--- a/pkg/sentry/pgalloc/save_restore.go
+++ b/pkg/sentry/pgalloc/save_restore.go
@@ -26,11 +26,12 @@ import (
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/state"
+ "gvisor.dev/gvisor/pkg/state/wire"
"gvisor.dev/gvisor/pkg/usermem"
)
// SaveTo writes f's state to the given stream.
-func (f *MemoryFile) SaveTo(ctx context.Context, w io.Writer) error {
+func (f *MemoryFile) SaveTo(ctx context.Context, w wire.Writer) error {
// Wait for reclaim.
f.mu.Lock()
defer f.mu.Unlock()
@@ -79,10 +80,10 @@ func (f *MemoryFile) SaveTo(ctx context.Context, w io.Writer) error {
}
// Save metadata.
- if err := state.Save(ctx, w, &f.fileSize, nil); err != nil {
+ if _, err := state.Save(ctx, w, &f.fileSize); err != nil {
return err
}
- if err := state.Save(ctx, w, &f.usage, nil); err != nil {
+ if _, err := state.Save(ctx, w, &f.usage); err != nil {
return err
}
@@ -115,9 +116,9 @@ func (f *MemoryFile) SaveTo(ctx context.Context, w io.Writer) error {
}
// LoadFrom loads MemoryFile state from the given stream.
-func (f *MemoryFile) LoadFrom(ctx context.Context, r io.Reader) error {
+func (f *MemoryFile) LoadFrom(ctx context.Context, r wire.Reader) error {
// Load metadata.
- if err := state.Load(ctx, r, &f.fileSize, nil); err != nil {
+ if _, err := state.Load(ctx, r, &f.fileSize); err != nil {
return err
}
if err := f.file.Truncate(f.fileSize); err != nil {
@@ -125,7 +126,7 @@ func (f *MemoryFile) LoadFrom(ctx context.Context, r io.Reader) error {
}
newMappings := make([]uintptr, f.fileSize>>chunkShift)
f.mappings.Store(newMappings)
- if err := state.Load(ctx, r, &f.usage, nil); err != nil {
+ if _, err := state.Load(ctx, r, &f.usage); err != nil {
return err
}
diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD
index 453241eca..209b28053 100644
--- a/pkg/sentry/platform/BUILD
+++ b/pkg/sentry/platform/BUILD
@@ -1,39 +1,21 @@
load("//tools:defs.bzl", "go_library")
-load("//tools/go_generics:defs.bzl", "go_template_instance")
package(licenses = ["notice"])
-go_template_instance(
- name = "file_range",
- out = "file_range.go",
- package = "platform",
- prefix = "File",
- template = "//pkg/segment:generic_range",
- types = {
- "T": "uint64",
- },
-)
-
go_library(
name = "platform",
srcs = [
"context.go",
- "file_range.go",
"mmap_min_addr.go",
"platform.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
- "//pkg/atomicbitops",
"//pkg/context",
- "//pkg/log",
- "//pkg/safecopy",
- "//pkg/safemem",
"//pkg/seccomp",
"//pkg/sentry/arch",
- "//pkg/sentry/usage",
- "//pkg/syserror",
+ "//pkg/sentry/memmap",
"//pkg/usermem",
],
)
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index 159f7eafd..3970dd81d 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -6,8 +6,8 @@ go_library(
name = "kvm",
srcs = [
"address_space.go",
- "allocator.go",
"bluepill.go",
+ "bluepill_allocator.go",
"bluepill_amd64.go",
"bluepill_amd64.s",
"bluepill_amd64_unsafe.go",
@@ -41,12 +41,14 @@ go_library(
deps = [
"//pkg/abi/linux",
"//pkg/atomicbitops",
+ "//pkg/context",
"//pkg/cpuid",
"//pkg/log",
"//pkg/procid",
"//pkg/safecopy",
"//pkg/seccomp",
"//pkg/sentry/arch",
+ "//pkg/sentry/memmap",
"//pkg/sentry/platform",
"//pkg/sentry/platform/interrupt",
"//pkg/sentry/platform/ring0",
@@ -60,6 +62,7 @@ go_library(
go_test(
name = "kvm_test",
srcs = [
+ "kvm_amd64_test.go",
"kvm_test.go",
"virtual_map_test.go",
],
diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go
index be213bfe8..af5c5e191 100644
--- a/pkg/sentry/platform/kvm/address_space.go
+++ b/pkg/sentry/platform/kvm/address_space.go
@@ -18,6 +18,7 @@ import (
"sync/atomic"
"gvisor.dev/gvisor/pkg/atomicbitops"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
"gvisor.dev/gvisor/pkg/sync"
@@ -26,16 +27,15 @@ import (
// dirtySet tracks vCPUs for invalidation.
type dirtySet struct {
- vCPUs []uint64
+ vCPUMasks []uint64
}
// forEach iterates over all CPUs in the dirty set.
+//
+//go:nosplit
func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) {
- m.mu.RLock()
- defer m.mu.RUnlock()
-
- for index := range ds.vCPUs {
- mask := atomic.SwapUint64(&ds.vCPUs[index], 0)
+ for index := range ds.vCPUMasks {
+ mask := atomic.SwapUint64(&ds.vCPUMasks[index], 0)
if mask != 0 {
for bit := 0; bit < 64; bit++ {
if mask&(1<<uint64(bit)) == 0 {
@@ -54,7 +54,7 @@ func (ds *dirtySet) mark(c *vCPU) bool {
index := uint64(c.id) / 64
bit := uint64(1) << uint(c.id%64)
- oldValue := atomic.LoadUint64(&ds.vCPUs[index])
+ oldValue := atomic.LoadUint64(&ds.vCPUMasks[index])
if oldValue&bit != 0 {
return false // Not clean.
}
@@ -62,7 +62,7 @@ func (ds *dirtySet) mark(c *vCPU) bool {
// Set the bit unilaterally, and ensure that a flush takes place. Note
// that it's possible for races to occur here, but since the flush is
// taking place long after these lines there's no race in practice.
- atomicbitops.OrUint64(&ds.vCPUs[index], bit)
+ atomicbitops.OrUint64(&ds.vCPUMasks[index], bit)
return true // Previously clean.
}
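
dirtySet packs one bit per vCPU into a []uint64, so mark and forEach stay lock-free: mark ORs a bit in, and forEach atomically swaps each word to zero before walking its set bits. A runnable illustration of the word/bit arithmetic (the CAS loop stands in for atomicbitops.OrUint64):

    package main

    import (
    	"fmt"
    	"sync/atomic"
    )

    func main() {
    	// One bit per vCPU: word index = id/64, bit = id%64.
    	masks := make([]uint64, 2) // Supports up to 128 vCPUs.
    	for _, id := range []int{3, 64, 70} {
    		index := uint64(id) / 64
    		bit := uint64(1) << uint(id%64)
    		for {
    			old := atomic.LoadUint64(&masks[index])
    			if atomic.CompareAndSwapUint64(&masks[index], old, old|bit) {
    				break
    			}
    		}
    	}
    	// forEach-style drain: swap each word to zero, then walk its bits.
    	for index := range masks {
    		mask := atomic.SwapUint64(&masks[index], 0)
    		for bit := 0; bit < 64; bit++ {
    			if mask&(1<<uint64(bit)) != 0 {
    				fmt.Println("dirty vCPU id:", index*64+bit)
    			}
    		}
    	}
    }
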
@@ -113,7 +113,12 @@ type hostMapEntry struct {
length uintptr
}
-func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) {
+// mapLocked maps the given host entry.
+//
+// +checkescape:hard,stack
+//
+//go:nosplit
+func (as *addressSpace) mapLocked(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) {
for m.length > 0 {
physical, length, ok := translateToPhysical(m.addr)
if !ok {
@@ -133,18 +138,10 @@ func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.Ac
// important; if the pagetable mappings were installed before
// ensuring the physical pages were available, then some other
// thread could theoretically access them.
- //
- // Due to the way KVM's shadow paging implementation works,
- // modifications to the page tables while in host mode may not
- // be trapped, leading to the shadow pages being out of sync.
- // Therefore, we need to ensure that we are in guest mode for
- // page table modifications. See the call to bluepill, below.
- as.machine.retryInGuest(func() {
- inv = as.pageTables.Map(addr, length, pagetables.MapOpts{
- AccessType: at,
- User: true,
- }, physical) || inv
- })
+ inv = as.pageTables.Map(addr, length, pagetables.MapOpts{
+ AccessType: at,
+ User: true,
+ }, physical) || inv
m.addr += length
m.length -= length
addr += usermem.Addr(length)
@@ -154,7 +151,7 @@ func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.Ac
}
// MapFile implements platform.AddressSpace.MapFile.
-func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.FileRange, at usermem.AccessType, precommit bool) error {
+func (as *addressSpace) MapFile(addr usermem.Addr, f memmap.File, fr memmap.FileRange, at usermem.AccessType, precommit bool) error {
as.mu.Lock()
defer as.mu.Unlock()
@@ -176,6 +173,10 @@ func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.
return err
}
+ // See block in mapLocked.
+ as.pageTables.Allocator.(*allocator).cpu = as.machine.Get()
+ defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu)
+
// Map the mappings in the sentry's address space (guest physical memory)
// into the application's address space (guest virtual memory).
inv := false
@@ -190,7 +191,12 @@ func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.
_ = s[i] // Touch to commit.
}
}
- prev := as.mapHost(addr, hostMapEntry{
+
+ // See bluepill_allocator.go.
+ bluepill(as.pageTables.Allocator.(*allocator).cpu)
+
+ // Perform the mapping.
+ prev := as.mapLocked(addr, hostMapEntry{
addr: b.Addr(),
length: uintptr(b.Len()),
}, at)
@@ -204,17 +210,27 @@ func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.
return nil
}
+// unmapLocked is an escape-checked wrapper around Unmap.
+//
+// +checkescape:hard,stack
+//
+//go:nosplit
+func (as *addressSpace) unmapLocked(addr usermem.Addr, length uint64) bool {
+ return as.pageTables.Unmap(addr, uintptr(length))
+}
+
// Unmap unmaps the given range by calling pagetables.PageTables.Unmap.
func (as *addressSpace) Unmap(addr usermem.Addr, length uint64) {
as.mu.Lock()
defer as.mu.Unlock()
- // See above re: retryInGuest.
- var prev bool
- as.machine.retryInGuest(func() {
- prev = as.pageTables.Unmap(addr, uintptr(length)) || prev
- })
- if prev {
+ // See above & bluepill_allocator.go.
+ as.pageTables.Allocator.(*allocator).cpu = as.machine.Get()
+ defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu)
+ bluepill(as.pageTables.Allocator.(*allocator).cpu)
+
+ if prev := as.unmapLocked(addr, length); prev {
+ // Invalidate all active vCPUs.
as.invalidate()
// Recycle any freed intermediate pages.
@@ -227,8 +243,14 @@ func (as *addressSpace) Release() {
as.Unmap(0, ^uint64(0))
// Free all pages from the allocator.
- as.pageTables.Allocator.(allocator).base.Drain()
+ as.pageTables.Allocator.(*allocator).base.Drain()
// Drop all cached machine references.
as.machine.dropPageTables(as.pageTables)
}
+
+// PreFork implements platform.AddressSpace.PreFork.
+func (as *addressSpace) PreFork() {}
+
+// PostFork implements platform.AddressSpace.PostFork.
+func (as *addressSpace) PostFork() {}
diff --git a/pkg/sentry/platform/kvm/allocator.go b/pkg/sentry/platform/kvm/bluepill_allocator.go
index 3f35414bb..9485e1301 100644
--- a/pkg/sentry/platform/kvm/allocator.go
+++ b/pkg/sentry/platform/kvm/bluepill_allocator.go
@@ -21,56 +21,80 @@ import (
)
type allocator struct {
- base *pagetables.RuntimeAllocator
+ base pagetables.RuntimeAllocator
+
+ // cpu must be set prior to any pagetable operation.
+ //
+ // Due to the way KVM's shadow paging implementation works,
+ // modifications to the page tables while in host mode may not be
+ // trapped, leading to the shadow pages being out of sync. Therefore,
+ // we need to ensure that we are in guest mode for page table
+ // modifications. See the call to bluepill, below.
+ cpu *vCPU
}
// newAllocator is used to define the allocator.
-func newAllocator() allocator {
- return allocator{
- base: pagetables.NewRuntimeAllocator(),
- }
+func newAllocator() *allocator {
+ a := new(allocator)
+ a.base.Init()
+ return a
}
// NewPTEs implements pagetables.Allocator.NewPTEs.
//
+// +checkescape:all
+//
//go:nosplit
-func (a allocator) NewPTEs() *pagetables.PTEs {
- return a.base.NewPTEs()
+func (a *allocator) NewPTEs() *pagetables.PTEs {
+ ptes := a.base.NewPTEs() // escapes: bluepill below.
+ if a.cpu != nil {
+ bluepill(a.cpu)
+ }
+ return ptes
}
// PhysicalFor returns the physical address for a set of PTEs.
//
+// +checkescape:all
+//
//go:nosplit
-func (a allocator) PhysicalFor(ptes *pagetables.PTEs) uintptr {
+func (a *allocator) PhysicalFor(ptes *pagetables.PTEs) uintptr {
virtual := a.base.PhysicalFor(ptes)
physical, _, ok := translateToPhysical(virtual)
if !ok {
- panic(fmt.Sprintf("PhysicalFor failed for %p", ptes))
+ panic(fmt.Sprintf("PhysicalFor failed for %p", ptes)) // escapes: panic.
}
return physical
}
// LookupPTEs implements pagetables.Allocator.LookupPTEs.
//
+// +checkescape:all
+//
//go:nosplit
-func (a allocator) LookupPTEs(physical uintptr) *pagetables.PTEs {
+func (a *allocator) LookupPTEs(physical uintptr) *pagetables.PTEs {
virtualStart, physicalStart, _, ok := calculateBluepillFault(physical, physicalRegions)
if !ok {
- panic(fmt.Sprintf("LookupPTEs failed for 0x%x", physical))
+ panic(fmt.Sprintf("LookupPTEs failed for 0x%x", physical)) // escapes: panic.
}
return a.base.LookupPTEs(virtualStart + (physical - physicalStart))
}
// FreePTEs implements pagetables.Allocator.FreePTEs.
//
+// +checkescape:all
+//
//go:nosplit
-func (a allocator) FreePTEs(ptes *pagetables.PTEs) {
- a.base.FreePTEs(ptes)
+func (a *allocator) FreePTEs(ptes *pagetables.PTEs) {
+ a.base.FreePTEs(ptes) // escapes: bluepill below.
+ if a.cpu != nil {
+ bluepill(a.cpu)
+ }
}
// Recycle implements pagetables.Allocator.Recycle.
//
//go:nosplit
-func (a allocator) Recycle() {
+func (a *allocator) Recycle() {
a.base.Recycle()
}
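
The allocator now carries the operating vCPU so that any page-table mutation re-enters guest mode via bluepill before touching PTEs, replacing the old retryInGuest wrapper. Call sites in address_space.go follow one shape; a distilled sketch within package kvm, with doPageTableWork as a hypothetical helper:

    // Pin a vCPU, expose it to the allocator, enter guest mode, then mutate
    // the page tables; NewPTEs/FreePTEs re-bluepill if a host transition
    // happened in the meantime.
    func (as *addressSpace) doPageTableWork(work func()) {
    	a := as.pageTables.Allocator.(*allocator)
    	a.cpu = as.machine.Get() // Pin a vCPU for the duration.
    	defer as.machine.Put(a.cpu)
    	bluepill(a.cpu) // Enter guest mode before touching page tables.
    	work()          // e.g. as.pageTables.Map(...) / Unmap(...).
    }
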
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go
index 133c2203d..ddc1554d5 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64.go
+++ b/pkg/sentry/platform/kvm/bluepill_amd64.go
@@ -63,6 +63,8 @@ func bluepillArchEnter(context *arch.SignalContext64) *vCPU {
// KernelSyscall handles kernel syscalls.
//
+// +checkescape:all
+//
//go:nosplit
func (c *vCPU) KernelSyscall() {
regs := c.Registers()
@@ -72,13 +74,15 @@ func (c *vCPU) KernelSyscall() {
// We only trigger a bluepill entry in the bluepill function, and can
// therefore be guaranteed that there is no floating point state to be
// loaded on resuming from halt. We only worry about saving on exit.
- ring0.SaveFloatingPoint((*byte)(c.floatingPointState))
+ ring0.SaveFloatingPoint((*byte)(c.floatingPointState)) // escapes: no.
ring0.Halt()
- ring0.WriteFS(uintptr(regs.Fs_base)) // Reload host segment.
+ ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no, reload host segment.
}
// KernelException handles kernel exceptions.
//
+// +checkescape:all
+//
//go:nosplit
func (c *vCPU) KernelException(vector ring0.Vector) {
regs := c.Registers()
@@ -89,9 +93,9 @@ func (c *vCPU) KernelException(vector ring0.Vector) {
regs.Rip = 0
}
// See above.
- ring0.SaveFloatingPoint((*byte)(c.floatingPointState))
+ ring0.SaveFloatingPoint((*byte)(c.floatingPointState)) // escapes: no.
ring0.Halt()
- ring0.WriteFS(uintptr(regs.Fs_base)) // Reload host segment.
+ ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no; reload host segment.
}
// bluepillArchExit is called during bluepillEnter.
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
index 99cac665d..03a98512e 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
@@ -17,6 +17,7 @@
package kvm
import (
+ "syscall"
"unsafe"
"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -53,3 +54,34 @@ func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) {
context.Rbx = uint64(uintptr(unsafe.Pointer(c)))
context.Rip = uint64(dieTrampolineAddr)
}
+
+// getHypercallID returns hypercall ID.
+//
+//go:nosplit
+func getHypercallID(addr uintptr) int {
+ return _KVM_HYPERCALL_MAX
+}
+
+// bluepillStopGuest is responsible for injecting an interrupt.
+//
+//go:nosplit
+func bluepillStopGuest(c *vCPU) {
+ // Interrupt: we must have requested an interrupt
+ // window; set the interrupt line.
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_INTERRUPT,
+ uintptr(unsafe.Pointer(&bounce))); errno != 0 {
+ throw("interrupt injection failed")
+ }
+ // Clear previous injection request.
+ c.runData.requestInterruptWindow = 0
+}
+
+// bluepillReadyStopGuest checks whether the current vCPU is ready for interrupt injection.
+//
+//go:nosplit
+func bluepillReadyStopGuest(c *vCPU) bool {
+ return c.runData.readyForInterruptInjection != 0
+}
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go
index c215d443c..ed5ae03d3 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64.go
+++ b/pkg/sentry/platform/kvm/bluepill_arm64.go
@@ -26,6 +26,17 @@ import (
var (
// The action for bluepillSignal is changed by sigaction().
bluepillSignal = syscall.SIGILL
+
+ // vcpuSErr is the event of system error.
+ vcpuSErr = kvmVcpuEvents{
+ exception: exception{
+ sErrPending: 1,
+ sErrHasEsr: 0,
+ pad: [6]uint8{0, 0, 0, 0, 0, 0},
+ sErrEsr: 1,
+ },
+ rsvd: [12]uint32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ }
)
// bluepillArchEnter is called during bluepillEnter.
@@ -38,7 +49,7 @@ func bluepillArchEnter(context *arch.SignalContext64) (c *vCPU) {
regs.Sp = context.Sp
regs.Pc = context.Pc
regs.Pstate = context.Pstate
- regs.Pstate &^= uint64(ring0.KernelFlagsClear)
+ regs.Pstate &^= uint64(ring0.PsrFlagsClear)
regs.Pstate |= ring0.KernelFlagsSet
return
}
@@ -52,7 +63,7 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) {
context.Sp = regs.Sp
context.Pc = regs.Pc
context.Pstate = regs.Pstate
- context.Pstate &^= uint64(ring0.UserFlagsClear)
+ context.Pstate &^= uint64(ring0.PsrFlagsClear)
context.Pstate |= ring0.UserFlagsSet
lazyVfp := c.GetLazyVFP()
@@ -66,6 +77,8 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) {
// KernelSyscall handles kernel syscalls.
//
+// +checkescape:all
+//
//go:nosplit
func (c *vCPU) KernelSyscall() {
regs := c.Registers()
@@ -88,6 +101,8 @@ func (c *vCPU) KernelSyscall() {
// KernelException handles kernel exceptions.
//
+// +checkescape:all
+//
//go:nosplit
func (c *vCPU) KernelException(vector ring0.Vector) {
regs := c.Registers()
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
index 4ca2b7717..b35c930e2 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
@@ -17,6 +17,7 @@
package kvm
import (
+ "syscall"
"unsafe"
"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -40,7 +41,7 @@ func fpsimdPtr(addr *byte) *arch.FpsimdContext {
func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) {
// If the vCPU is in user mode, we set the stack to the stored stack
// value in the vCPU itself. We don't want to unwind the user stack.
- if guestRegs.Regs.Pstate&ring0.PSR_MODE_MASK == ring0.PSR_MODE_EL0t {
+ if guestRegs.Regs.Pstate&ring0.PsrModeMask == ring0.UserFlagsSet {
regs := c.CPU.Registers()
context.Regs[0] = regs.Regs[0]
context.Sp = regs.Sp
@@ -61,3 +62,36 @@ func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) {
func bluepillArchFpContext(context unsafe.Pointer) *arch.FpsimdContext {
return &((*arch.SignalContext64)(context).Fpsimd64)
}
+
+// getHypercallID returns hypercall ID.
+//
+// On Arm64, the MMIO address should be 64-bit aligned.
+//
+//go:nosplit
+func getHypercallID(addr uintptr) int {
+ if addr < arm64HypercallMMIOBase || addr >= (arm64HypercallMMIOBase+_AARCH64_HYPERCALL_MMIO_SIZE) {
+ return _KVM_HYPERCALL_MAX
+ } else {
+ return int(((addr) - arm64HypercallMMIOBase) >> 3)
+ }
+}
+
+// bluepillStopGuest is responsible for injecting an sError.
+//
+//go:nosplit
+func bluepillStopGuest(c *vCPU) {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_SET_VCPU_EVENTS,
+ uintptr(unsafe.Pointer(&vcpuSErr))); errno != 0 {
+ throw("sErr injection failed")
+ }
+}
+
+// bluepillReadyStopGuest checks whether the current vCPU is ready for sError injection.
+//
+//go:nosplit
+func bluepillReadyStopGuest(c *vCPU) bool {
+ return true
+}
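
Each hypercall occupies an 8-byte-aligned MMIO slot above arm64HypercallMMIOBase, so the ID is just the offset shifted right by 3; amd64 has no MMIO window and always returns _KVM_HYPERCALL_MAX. A standalone illustration, with an assumed base of 0x1000 (the real base is picked at runtime):

    package main

    import "fmt"

    const (
    	hypercallMMIOBase = 0x1000 // Illustrative only.
    	mmioSize          = 1 << 3 // One 8-byte slot per hypercall.
    	hypercallMax      = 1      // A single hypercall is defined.
    )

    func hypercallID(addr uintptr) int {
    	if addr < hypercallMMIOBase || addr >= hypercallMMIOBase+hypercallMax*mmioSize {
    		return hypercallMax // Not a hypercall address.
    	}
    	return int((addr - hypercallMMIOBase) >> 3)
    }

    func main() {
    	fmt.Println(hypercallID(0x1000)) // 0: the vmexit hypercall.
    	fmt.Println(hypercallID(0x0fff)) // 1: below the window.
    	fmt.Println(hypercallID(0x1008)) // 1: beyond the defined slots.
    }
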
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index 9add7c944..bf357de1a 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -13,7 +13,7 @@
// limitations under the License.
// +build go1.12
-// +build !go1.15
+// +build !go1.16
// Check go:linkname function signatures when updating Go version.
@@ -58,12 +58,32 @@ func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
return &((*arch.UContext64)(context).MContext)
}
+// bluepillGuestExit is responsible for handling a VM-exit.
+//
+//go:nosplit
+func bluepillGuestExit(c *vCPU, context unsafe.Pointer) {
+ // Copy out registers.
+ bluepillArchExit(c, bluepillArchContext(context))
+
+ // Return to the vCPUReady state; notify any waiters.
+ user := atomic.LoadUint32(&c.state) & vCPUUser
+ switch atomic.SwapUint32(&c.state, user) {
+ case user | vCPUGuest: // Expected case.
+ case user | vCPUGuest | vCPUWaiter:
+ c.notify()
+ default:
+ throw("invalid state")
+ }
+}
+
// bluepillHandler is called from the signal stub.
//
// The world may be stopped while this is executing, and it executes on the
// signal stack. It should only execute raw system calls and functions that are
// explicitly marked go:nosplit.
//
+// +checkescape:all
+//
//go:nosplit
func bluepillHandler(context unsafe.Pointer) {
// Sanitize the registers; interrupts must always be disabled.
@@ -82,7 +102,8 @@ func bluepillHandler(context unsafe.Pointer) {
}
for {
- switch _, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0); errno {
+ _, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0) // escapes: no.
+ switch errno {
case 0: // Expected case.
case syscall.EINTR:
// First, we process whatever pending signal
@@ -90,7 +111,7 @@ func bluepillHandler(context unsafe.Pointer) {
// currently, all signals are masked and the signal
// must have been delivered directly to this thread.
timeout := syscall.Timespec{}
- sig, _, errno := syscall.RawSyscall6(
+ sig, _, errno := syscall.RawSyscall6( // escapes: no.
syscall.SYS_RT_SIGTIMEDWAIT,
uintptr(unsafe.Pointer(&bounceSignalMask)),
0, // siginfo.
@@ -112,12 +133,12 @@ func bluepillHandler(context unsafe.Pointer) {
// PIC, we can't inject an interrupt while they are
// masked. We need to request a window if it's not
// ready.
- if c.runData.readyForInterruptInjection == 0 {
- c.runData.requestInterruptWindow = 1
- continue // Rerun vCPU.
- } else {
+ if bluepillReadyStopGuest(c) {
// Force injection below; the vCPU is ready.
c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN
+ } else {
+ c.runData.requestInterruptWindow = 1
+ continue // Rerun vCPU.
}
case syscall.EFAULT:
// If a fault is not serviceable due to the host
@@ -125,7 +146,7 @@ func bluepillHandler(context unsafe.Pointer) {
// MMIO exit we receive EFAULT from the run ioctl. We
// always inject an NMI here since we may be in kernel
// mode and have interrupts disabled.
- if _, _, errno := syscall.RawSyscall(
+ if _, _, errno := syscall.RawSyscall( // escapes: no.
syscall.SYS_IOCTL,
uintptr(c.fd),
_KVM_NMI, 0); errno != 0 {
@@ -156,25 +177,20 @@ func bluepillHandler(context unsafe.Pointer) {
c.die(bluepillArchContext(context), "debug")
return
case _KVM_EXIT_HLT:
- // Copy out registers.
- bluepillArchExit(c, bluepillArchContext(context))
-
- // Return to the vCPUReady state; notify any waiters.
- user := atomic.LoadUint32(&c.state) & vCPUUser
- switch atomic.SwapUint32(&c.state, user) {
- case user | vCPUGuest: // Expected case.
- case user | vCPUGuest | vCPUWaiter:
- c.notify()
- default:
- throw("invalid state")
- }
+ bluepillGuestExit(c, context)
return
case _KVM_EXIT_MMIO:
+ physical := uintptr(c.runData.data[0])
+ if getHypercallID(physical) == _KVM_HYPERCALL_VMEXIT {
+ bluepillGuestExit(c, context)
+ return
+ }
+
// Increment the fault count.
atomic.AddUint32(&c.faults, 1)
// For MMIO, the physical address is the first data item.
- physical := uintptr(c.runData.data[0])
+ physical = uintptr(c.runData.data[0])
virtual, ok := handleBluepillFault(c.machine, physical, physicalRegions, _KVM_MEM_FLAGS_NONE)
if !ok {
c.die(bluepillArchContext(context), "invalid physical address")
@@ -201,17 +217,7 @@ func bluepillHandler(context unsafe.Pointer) {
}
}
case _KVM_EXIT_IRQ_WINDOW_OPEN:
- // Interrupt: we must have requested an interrupt
- // window; set the interrupt line.
- if _, _, errno := syscall.RawSyscall(
- syscall.SYS_IOCTL,
- uintptr(c.fd),
- _KVM_INTERRUPT,
- uintptr(unsafe.Pointer(&bounce))); errno != 0 {
- throw("interrupt injection failed")
- }
- // Clear previous injection request.
- c.runData.requestInterruptWindow = 0
+ bluepillStopGuest(c)
case _KVM_EXIT_SHUTDOWN:
c.die(bluepillArchContext(context), "shutdown")
return
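
bluepillGuestExit's switch encodes the vCPU state machine: keep the user bit, clear guest/waiter, and notify only if a waiter was recorded. A runnable model of that transition; the bit values are assumed here for illustration:

    package main

    import (
    	"fmt"
    	"sync/atomic"
    )

    const (
    	vCPUUser   uint32 = 1 << 0
    	vCPUGuest  uint32 = 1 << 1
    	vCPUWaiter uint32 = 1 << 2
    )

    func guestExit(state *uint32, notify func()) {
    	// Keep the user bit, drop guest/waiter; the previous value tells
    	// us whether anyone was waiting on the transition.
    	user := atomic.LoadUint32(state) & vCPUUser
    	switch atomic.SwapUint32(state, user) {
    	case user | vCPUGuest: // Expected case: nobody waiting.
    	case user | vCPUGuest | vCPUWaiter:
    		notify()
    	default:
    		panic("invalid state")
    	}
    }

    func main() {
    	state := vCPUUser | vCPUGuest | vCPUWaiter
    	guestExit(&state, func() { fmt.Println("waiter notified") })
    	fmt.Printf("state after exit: %#x\n", state) // vCPUUser only.
    }
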
diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go
index 6507121ea..6e6b76416 100644
--- a/pkg/sentry/platform/kvm/context.go
+++ b/pkg/sentry/platform/kvm/context.go
@@ -15,6 +15,7 @@
package kvm
import (
+ pkgcontext "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/platform/interrupt"
@@ -37,7 +38,8 @@ type context struct {
}
// Switch runs the provided context in the given address space.
-func (c *context) Switch(as platform.AddressSpace, ac arch.Context, _ int32) (*arch.SignalInfo, usermem.AccessType, error) {
+func (c *context) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac arch.Context, _ int32) (*arch.SignalInfo, usermem.AccessType, error) {
+ as := mm.AddressSpace()
localAS := as.(*addressSpace)
// Grab a vCPU.
@@ -88,3 +90,9 @@ func (c *context) Interrupt() {
// Release implements platform.Context.Release().
func (c *context) Release() {}
+
+// FullStateChanged implements platform.Context.FullStateChanged.
+func (c *context) FullStateChanged() {}
+
+// PullFullState implements platform.Context.PullFullState.
+func (c *context) PullFullState(as platform.AddressSpace, ac arch.Context) {}
diff --git a/pkg/sentry/platform/kvm/kvm_amd64_test.go b/pkg/sentry/platform/kvm/kvm_amd64_test.go
new file mode 100644
index 000000000..c0b4fd374
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_amd64_test.go
@@ -0,0 +1,51 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package kvm
+
+import (
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil"
+ "gvisor.dev/gvisor/pkg/sentry/platform/ring0"
+ "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
+)
+
+func TestSegments(t *testing.T) {
+ applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
+ testutil.SetTestSegments(regs)
+ for {
+ var si arch.SignalInfo
+ if _, err := c.SwitchToUser(ring0.SwitchOpts{
+ Registers: regs,
+ FloatingPointState: dummyFPState,
+ PageTables: pt,
+ FullRestore: true,
+ }, &si); err == platform.ErrContextInterrupt {
+ continue // Retry.
+ } else if err != nil {
+ t.Errorf("application segment check with full restore got unexpected error: %v", err)
+ }
+ if err := testutil.CheckTestSegments(regs); err != nil {
+ t.Errorf("application segment check with full restore failed: %v", err)
+ }
+ break // Done.
+ }
+ return false
+ })
+}
diff --git a/pkg/sentry/platform/kvm/kvm_arm64.go b/pkg/sentry/platform/kvm/kvm_arm64.go
index 29d457a7e..0b06a923a 100644
--- a/pkg/sentry/platform/kvm/kvm_arm64.go
+++ b/pkg/sentry/platform/kvm/kvm_arm64.go
@@ -26,6 +26,9 @@ type kvmOneReg struct {
addr uint64
}
+// arm64HypercallMMIOBase is the MMIO base address used to dispatch hypercalls.
+var arm64HypercallMMIOBase uintptr
+
const KVM_NR_SPSR = 5
type userFpsimdState struct {
@@ -43,6 +46,18 @@ type userRegs struct {
fpRegs userFpsimdState
}
+type exception struct {
+ sErrPending uint8
+ sErrHasEsr uint8
+ pad [6]uint8
+ sErrEsr uint64
+}
+
+type kvmVcpuEvents struct {
+ exception
+ rsvd [12]uint32
+}
+
// updateGlobalOnce does global initialization. It has to be called only once.
func updateGlobalOnce(fd int) error {
physicalInit()
diff --git a/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go b/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go
index 6531bae1d..48ccf8474 100644
--- a/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go
@@ -22,7 +22,8 @@ import (
)
var (
- runDataSize int
+ runDataSize int
+ hasGuestPCID bool
)
func updateSystemValues(fd int) error {
@@ -33,6 +34,7 @@ func updateSystemValues(fd int) error {
}
// Save the data.
runDataSize = int(sz)
+ hasGuestPCID = true
// Success.
return nil
diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go
index 1d5c77ff4..3bf918446 100644
--- a/pkg/sentry/platform/kvm/kvm_const.go
+++ b/pkg/sentry/platform/kvm/kvm_const.go
@@ -35,6 +35,8 @@ const (
_KVM_GET_SUPPORTED_CPUID = 0xc008ae05
_KVM_SET_CPUID2 = 0x4008ae90
_KVM_SET_SIGNAL_MASK = 0x4004ae8b
+ _KVM_GET_VCPU_EVENTS = 0x8040ae9f
+ _KVM_SET_VCPU_EVENTS = 0x4040aea0
)
// KVM exit reasons.
@@ -54,8 +56,10 @@ const (
// KVM capability options.
const (
- _KVM_CAP_MAX_VCPUS = 0x42
- _KVM_CAP_ARM_VM_IPA_SIZE = 0xa5
+ _KVM_CAP_MAX_VCPUS = 0x42
+ _KVM_CAP_ARM_VM_IPA_SIZE = 0xa5
+ _KVM_CAP_VCPU_EVENTS = 0x29
+ _KVM_CAP_ARM_INJECT_SERROR_ESR = 0x9e
)
// KVM limits.
@@ -71,3 +75,13 @@ const (
_KVM_MEM_READONLY = uint32(1) << 1
_KVM_MEM_FLAGS_NONE = 0
)
+
+// KVM hypercall list.
+// Canonical list of hypercalls supported.
+const (
+ // On amd64, it uses 'HLT' to leave the guest.
+ // Unlike amd64, arm64 can only use mmio_exit/psci to leave the guest.
+ // _KVM_HYPERCALL_VMEXIT is only used on Arm64 for now.
+ _KVM_HYPERCALL_VMEXIT int = iota
+ _KVM_HYPERCALL_MAX
+)
diff --git a/pkg/sentry/platform/kvm/kvm_const_arm64.go b/pkg/sentry/platform/kvm/kvm_const_arm64.go
index 531ae8b1e..9a7be3655 100644
--- a/pkg/sentry/platform/kvm/kvm_const_arm64.go
+++ b/pkg/sentry/platform/kvm/kvm_const_arm64.go
@@ -72,6 +72,7 @@ const (
_TCR_T0SZ_VA48 = 64 - 48 // VA=48
_TCR_T1SZ_VA48 = 64 - 48 // VA=48
+ _TCR_A1 = 1 << 22
_TCR_ASID16 = 1 << 36
_TCR_TBI0 = 1 << 37
@@ -116,6 +117,17 @@ const (
// Arm64: Exception Syndrome Register EL1.
const (
+ _ESR_ELx_EC_SHIFT = 26
+ _ESR_ELx_EC_MASK = 0x3F << _ESR_ELx_EC_SHIFT
+
+ _ESR_ELx_EC_IMP_DEF = 0x1f
+ _ESR_ELx_EC_IABT_LOW = 0x20
+ _ESR_ELx_EC_IABT_CUR = 0x21
+ _ESR_ELx_EC_PC_ALIGN = 0x22
+
+ _ESR_ELx_CM = 1 << 8
+ _ESR_ELx_WNR = 1 << 6
+
_ESR_ELx_FSC = 0x3F
_ESR_SEGV_MAPERR_L0 = 0x4
@@ -131,3 +143,10 @@ const (
_ESR_SEGV_PEMERR_L2 = 0xe
_ESR_SEGV_PEMERR_L3 = 0xf
)
+
+// Arm64: MMIO base address used to dispatch hypercalls.
+const (
+ // On arm64, the MMIO address must be 64-bit aligned.
+ // Currently, we only need 1 hypercall: hypercall_vmexit.
+ _AARCH64_HYPERCALL_MMIO_SIZE = 1 << 3
+)
diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go
index 6c8f4fa28..45b3180f1 100644
--- a/pkg/sentry/platform/kvm/kvm_test.go
+++ b/pkg/sentry/platform/kvm/kvm_test.go
@@ -262,30 +262,6 @@ func TestRegistersFault(t *testing.T) {
})
}
-func TestSegments(t *testing.T) {
- applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
- testutil.SetTestSegments(regs)
- for {
- var si arch.SignalInfo
- if _, err := c.SwitchToUser(ring0.SwitchOpts{
- Registers: regs,
- FloatingPointState: dummyFPState,
- PageTables: pt,
- FullRestore: true,
- }, &si); err == platform.ErrContextInterrupt {
- continue // Retry.
- } else if err != nil {
- t.Errorf("application segment check with full restore got unexpected error: %v", err)
- }
- if err := testutil.CheckTestSegments(regs); err != nil {
- t.Errorf("application segment check with full restore failed: %v", err)
- }
- break // Done.
- }
- return false
- })
-}
-
func TestBounce(t *testing.T) {
applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
go func() {
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index f1afc74dc..6c54712d1 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -52,16 +52,19 @@ type machine struct {
// available is notified when vCPUs are available.
available sync.Cond
- // vCPUs are the machine vCPUs.
+ // vCPUsByTID are the machine vCPUs.
//
// These are populated dynamically.
- vCPUs map[uint64]*vCPU
+ vCPUsByTID map[uint64]*vCPU
// vCPUsByID are the machine vCPUs, can be indexed by the vCPU's ID.
- vCPUsByID map[int]*vCPU
+ vCPUsByID []*vCPU
// maxVCPUs is the maximum number of vCPUs supported by the machine.
maxVCPUs int
+
+ // nextID is the next vCPU ID.
+ nextID uint32
}
const (
@@ -137,9 +140,8 @@ type dieState struct {
//
// Precondition: mu must be held.
func (m *machine) newVCPU() *vCPU {
- id := len(m.vCPUs)
-
// Create the vCPU.
+ id := int(atomic.AddUint32(&m.nextID, 1) - 1)
fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id))
if errno != 0 {
panic(fmt.Sprintf("error creating new vCPU: %v", errno))
@@ -176,11 +178,7 @@ func (m *machine) newVCPU() *vCPU {
// newMachine returns a new VM context.
func newMachine(vm int) (*machine, error) {
// Create the machine.
- m := &machine{
- fd: vm,
- vCPUs: make(map[uint64]*vCPU),
- vCPUsByID: make(map[int]*vCPU),
- }
+ m := &machine{fd: vm}
m.available.L = &m.mu
m.kernel.Init(ring0.KernelOpts{
PageTables: pagetables.New(newAllocator()),
@@ -194,6 +192,10 @@ func newMachine(vm int) (*machine, error) {
}
log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
+ // Create the vCPUs map/slices.
+ m.vCPUsByTID = make(map[uint64]*vCPU)
+ m.vCPUsByID = make([]*vCPU, m.maxVCPUs)
+
// Apply the physical mappings. Note that these mappings may point to
// guest physical addresses that are not actually available. These
// physical pages are mapped on demand, see kernel_unsafe.go.
@@ -274,6 +276,8 @@ func newMachine(vm int) (*machine, error) {
// not available. This attempts to be efficient for calls in the hot path.
//
// This panics on error.
+//
+//go:nosplit
func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalRegion, flags uint32) {
for end := physical + length; physical < end; {
_, physicalStart, length, ok := calculateBluepillFault(physical, phyRegions)
@@ -304,7 +308,11 @@ func (m *machine) Destroy() {
runtime.SetFinalizer(m, nil)
// Destroy vCPUs.
- for _, c := range m.vCPUs {
+ for _, c := range m.vCPUsByID {
+ if c == nil {
+ continue
+ }
+
// Ensure the vCPU is not still running in guest mode. This is
// possible iff teardown has been done by other threads, and
// somehow a single thread has not executed any system calls.
@@ -337,7 +345,7 @@ func (m *machine) Get() *vCPU {
tid := procid.Current()
// Check for an exact match.
- if c := m.vCPUs[tid]; c != nil {
+ if c := m.vCPUsByTID[tid]; c != nil {
c.lock()
m.mu.RUnlock()
return c
@@ -356,7 +364,7 @@ func (m *machine) Get() *vCPU {
tid = procid.Current()
// Recheck for an exact match.
- if c := m.vCPUs[tid]; c != nil {
+ if c := m.vCPUsByTID[tid]; c != nil {
c.lock()
m.mu.Unlock()
return c
@@ -364,10 +372,10 @@ func (m *machine) Get() *vCPU {
for {
// Scan for an available vCPU.
- for origTID, c := range m.vCPUs {
+ for origTID, c := range m.vCPUsByTID {
if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) {
- delete(m.vCPUs, origTID)
- m.vCPUs[tid] = c
+ delete(m.vCPUsByTID, origTID)
+ m.vCPUsByTID[tid] = c
m.mu.Unlock()
c.loadSegments(tid)
return c
@@ -375,17 +383,17 @@ func (m *machine) Get() *vCPU {
}
// Create a new vCPU (maybe).
- if len(m.vCPUs) < m.maxVCPUs {
+ if int(m.nextID) < m.maxVCPUs {
c := m.newVCPU()
c.lock()
- m.vCPUs[tid] = c
+ m.vCPUsByTID[tid] = c
m.mu.Unlock()
c.loadSegments(tid)
return c
}
// Scan for something not in user mode.
- for origTID, c := range m.vCPUs {
+ for origTID, c := range m.vCPUsByTID {
if !atomic.CompareAndSwapUint32(&c.state, vCPUGuest, vCPUGuest|vCPUWaiter) {
continue
}
@@ -403,8 +411,8 @@ func (m *machine) Get() *vCPU {
}
// Steal the vCPU.
- delete(m.vCPUs, origTID)
- m.vCPUs[tid] = c
+ delete(m.vCPUsByTID, origTID)
+ m.vCPUsByTID[tid] = c
m.mu.Unlock()
c.loadSegments(tid)
return c
@@ -431,7 +439,7 @@ func (m *machine) Put(c *vCPU) {
// newDirtySet returns a new dirty set.
func (m *machine) newDirtySet() *dirtySet {
return &dirtySet{
- vCPUs: make([]uint64, (m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64),
+ vCPUMasks: make([]uint64, (m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64),
}
}
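The vCPUMasks rename makes the bit-per-vCPU layout explicit: (maxVCPUs+63)/64 rounds up to the number of 64-bit words needed to hold one bit per vCPU ID. A non-atomic sketch of the implied indexing (the real dirtySet uses atomic operations; these helper names are hypothetical):

// markVCPU sets the dirty bit for the given vCPU ID.
func markVCPU(masks []uint64, id int) {
	masks[id/64] |= uint64(1) << (uint(id) % 64)
}

// vCPUMarked reports whether the dirty bit for the given vCPU ID is set.
func vCPUMarked(masks []uint64, id int) bool {
	return masks[id/64]&(uint64(1)<<(uint(id)%64)) != 0
}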
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index 923ce3909..acc823ba6 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -51,9 +51,10 @@ func (m *machine) initArchState() error {
recover()
debug.SetPanicOnFault(old)
}()
- m.retryInGuest(func() {
- ring0.SetCPUIDFaulting(true)
- })
+ c := m.Get()
+ defer m.Put(c)
+ bluepill(c)
+ ring0.SetCPUIDFaulting(true)
return nil
}
@@ -89,8 +90,8 @@ func (m *machine) dropPageTables(pt *pagetables.PageTables) {
defer m.mu.Unlock()
// Clear from all PCIDs.
- for _, c := range m.vCPUs {
- if c.PCIDs != nil {
+ for _, c := range m.vCPUsByID {
+ if c != nil && c.PCIDs != nil {
c.PCIDs.Drop(pt)
}
}
@@ -335,29 +336,6 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo)
}
}
-// retryInGuest runs the given function in guest mode.
-//
-// If the function does not complete in guest mode (due to execution of a
-// system call due to a GC stall, for example), then it will be retried. The
-// given function must be idempotent as a result of the retry mechanism.
-func (m *machine) retryInGuest(fn func()) {
- c := m.Get()
- defer m.Put(c)
- for {
- c.ClearErrorCode() // See below.
- bluepill(c) // Force guest mode.
- fn() // Execute the given function.
- _, user := c.ErrorCode()
- if user {
- // If user is set, then we haven't bailed back to host
- // mode via a kernel exception or system call. We
- // consider the full function to have executed in guest
- // mode and we can return.
- break
- }
- }
-}
-
// On x86 platform, the flags for "setMemoryRegion" can always be set as 0.
// There is no need to return read-only physicalRegions.
func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
index 7156c245f..290f035dd 100644
--- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
@@ -154,7 +154,7 @@ func (c *vCPU) setUserRegisters(uregs *userRegs) error {
//
//go:nosplit
func (c *vCPU) getUserRegisters(uregs *userRegs) syscall.Errno {
- if _, _, errno := syscall.RawSyscall(
+ if _, _, errno := syscall.RawSyscall( // escapes: no.
syscall.SYS_IOCTL,
uintptr(c.fd),
_KVM_GET_REGS,
diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go
index e42505542..9db171af9 100644
--- a/pkg/sentry/platform/kvm/machine_arm64.go
+++ b/pkg/sentry/platform/kvm/machine_arm64.go
@@ -60,6 +60,12 @@ func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
if !vr.accessType.Write && vr.accessType.Read {
rdonlyRegions = append(rdonlyRegions, vr.region)
}
+
+ // TODO(gvisor.dev/issue/2686): PROT_NONE should be specially treated.
+ // Workaround: treat them as read-only for now.
+ if !vr.accessType.Write && !vr.accessType.Read && !vr.accessType.Execute {
+ rdonlyRegions = append(rdonlyRegions, vr.region)
+ }
})
for _, r := range rdonlyRegions {
@@ -100,7 +106,7 @@ func (m *machine) dropPageTables(pt *pagetables.PageTables) {
defer m.mu.Unlock()
// Clear from all PCIDs.
- for _, c := range m.vCPUs {
+ for _, c := range m.vCPUsByID {
if c.PCIDs != nil {
c.PCIDs.Drop(pt)
}
@@ -119,71 +125,59 @@ func nonCanonical(addr uint64, signal int32, info *arch.SignalInfo) (usermem.Acc
return usermem.NoAccess, platform.ErrContextSignal
}
+// isInstructionAbort returns true if it is an instruction abort.
+//
+//go:nosplit
+func isInstructionAbort(code uint64) bool {
+ value := (code & _ESR_ELx_EC_MASK) >> _ESR_ELx_EC_SHIFT
+ return value == _ESR_ELx_EC_IABT_LOW
+}
+
+// isWriteFault returns whether it is a write fault.
+//
+//go:nosplit
+func isWriteFault(code uint64) bool {
+ if isInstructionAbort(code) {
+ return false
+ }
+
+ return (code & _ESR_ELx_WNR) != 0
+}
+
// fault generates an appropriate fault return.
//
//go:nosplit
func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (usermem.AccessType, error) {
+ bluepill(c) // Probably no-op, but may not be.
faultAddr := c.GetFaultAddr()
code, user := c.ErrorCode()
+ if !user {
+ // The last fault serviced by this CPU was not a user
+ // fault, so we can't reliably trust the faultAddr or
+ // the code provided here. We need to re-execute.
+ return usermem.NoAccess, platform.ErrContextInterrupt
+ }
+
// Reset the pointed SignalInfo.
*info = arch.SignalInfo{Signo: signal}
info.SetAddr(uint64(faultAddr))
- read := true
- write := false
- execute := true
-
ret := code & _ESR_ELx_FSC
switch ret {
case _ESR_SEGV_MAPERR_L0, _ESR_SEGV_MAPERR_L1, _ESR_SEGV_MAPERR_L2, _ESR_SEGV_MAPERR_L3:
info.Code = 1 //SEGV_MAPERR
- read = false
- write = true
- execute = false
case _ESR_SEGV_ACCERR_L1, _ESR_SEGV_ACCERR_L2, _ESR_SEGV_ACCERR_L3, _ESR_SEGV_PEMERR_L1, _ESR_SEGV_PEMERR_L2, _ESR_SEGV_PEMERR_L3:
info.Code = 2 // SEGV_ACCERR.
- read = true
- write = false
- execute = false
default:
info.Code = 2
}
- if !user {
- read = true
- write = false
- execute = true
-
- }
accessType := usermem.AccessType{
- Read: read,
- Write: write,
- Execute: execute,
+ Read: !isWriteFault(uint64(code)),
+ Write: isWriteFault(uint64(code)),
+ Execute: isInstructionAbort(uint64(code)),
}
return accessType, platform.ErrContextSignal
}
-
-// retryInGuest runs the given function in guest mode.
-//
-// If the function does not complete in guest mode (due to execution of a
-// system call due to a GC stall, for example), then it will be retried. The
-// given function must be idempotent as a result of the retry mechanism.
-func (m *machine) retryInGuest(fn func()) {
- c := m.Get()
- defer m.Put(c)
- for {
- c.ClearErrorCode() // See below.
- bluepill(c) // Force guest mode.
- fn() // Execute the given function.
- _, user := c.ErrorCode()
- if user {
- // If user is set, then we haven't bailed back to host
- // mode via a kernel exception or system call. We
- // consider the full function to have executed in guest
- // mode and we can return.
- break
- }
- }
-}
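With isWriteFault and isInstructionAbort in place, the access type falls directly out of the ESR encoding. A worked example, assuming the standard ARMv8 ESR_ELx layout (0x24 is the data-abort-from-lower-EL exception class, which this file does not name as a constant):

// classify maps an ESR value to the implied access type (sketch mirroring
// the logic in fault above).
func classify(esr uint64) usermem.AccessType {
	return usermem.AccessType{
		Read:    !isWriteFault(esr),
		Write:   isWriteFault(esr),
		Execute: isInstructionAbort(esr),
	}
}

// Example: a data abort from EL0 with the WnR bit set is a write fault:
//   esr := uint64(0x24)<<_ESR_ELx_EC_SHIFT | _ESR_ELx_WNR
//   classify(esr) => {Read: false, Write: true, Execute: false}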
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index 3c02cef7c..905712076 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -26,6 +26,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
+ "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -60,7 +61,6 @@ func (c *vCPU) initArchState() error {
reg.addr = uint64(reflect.ValueOf(&data).Pointer())
regGet.addr = uint64(reflect.ValueOf(&dataGet).Pointer())
- vcpuInit.target = _KVM_ARM_TARGET_GENERIC_V8
vcpuInit.features[0] |= (1 << _KVM_ARM_VCPU_PSCI_0_2)
if _, _, errno := syscall.RawSyscall(
syscall.SYS_IOCTL,
@@ -78,19 +78,6 @@ func (c *vCPU) initArchState() error {
return err
}
- // sctlr_el1
- regGet.id = _KVM_ARM64_REGS_SCTLR_EL1
- if err := c.getOneRegister(&regGet); err != nil {
- return err
- }
-
- dataGet |= (_SCTLR_M | _SCTLR_C | _SCTLR_I)
- data = dataGet
- reg.id = _KVM_ARM64_REGS_SCTLR_EL1
- if err := c.setOneRegister(&reg); err != nil {
- return err
- }
-
// tcr_el1
data = _TCR_TXSZ_VA48 | _TCR_CACHE_FLAGS | _TCR_SHARED | _TCR_TG_FLAGS | _TCR_ASID16 | _TCR_IPS_40BITS
reg.id = _KVM_ARM64_REGS_TCR_EL1
@@ -159,10 +146,16 @@ func (c *vCPU) initArchState() error {
return err
}
- data = ring0.PsrDefaultSet | ring0.KernelFlagsSet
- reg.id = _KVM_ARM64_REGS_PSTATE
- if err := c.setOneRegister(&reg); err != nil {
- return err
+ // Use the address of the exception vector table as
+ // the MMIO base address.
+ arm64HypercallMMIOBase = toLocation
+
+ // Initialize the PCID database.
+ if hasGuestPCID {
+ // Note that NewPCIDs may return a nil table here, in which
+ // case we simply don't use PCID support (see below). In
+ // practice, this should not happen, however.
+ c.PCIDs = pagetables.NewPCIDs(fixedKernelPCID+1, poolPCIDs)
}
c.floatingPointState = arch.NewFloatingPointData()
@@ -243,6 +236,13 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo)
return nonCanonical(regs.Sp, int32(syscall.SIGBUS), info)
}
+ // Assign PCIDs.
+ if c.PCIDs != nil {
+ var requireFlushPCID bool // Force a flush?
+ switchOpts.UserASID, requireFlushPCID = c.PCIDs.Assign(switchOpts.PageTables)
+ switchOpts.Flush = switchOpts.Flush || requireFlushPCID
+ }
+
var vector ring0.Vector
ttbr0App := switchOpts.PageTables.TTBR0_EL1(false, 0)
c.SetTtbr0App(uintptr(ttbr0App))
@@ -269,10 +269,18 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo)
case ring0.PageFault:
return c.fault(int32(syscall.SIGSEGV), info)
- case 0xaa:
- return usermem.NoAccess, nil
+ case ring0.Vector(bounce): // ring0.VirtualizationException
+ return usermem.NoAccess, platform.ErrContextInterrupt
+ case ring0.El0Sync_undef,
+ ring0.El1Sync_undef:
+ *info = arch.SignalInfo{
+ Signo: int32(syscall.SIGILL),
+ Code: 1, // ILL_ILLOPC (illegal opcode).
+ }
+ info.SetAddr(switchOpts.Registers.Pc) // Include address.
+ return usermem.AccessType{}, platform.ErrContextSignal
default:
- return usermem.NoAccess, platform.ErrContextSignal
+ panic(fmt.Sprintf("unexpected vector: 0x%x", vector))
}
}
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go
index f04be2ab5..9f86f6a7a 100644
--- a/pkg/sentry/platform/kvm/machine_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_unsafe.go
@@ -13,7 +13,7 @@
// limitations under the License.
// +build go1.12
-// +build !go1.15
+// +build !go1.16
// Check go:linkname function signatures when updating Go version.
@@ -115,7 +115,7 @@ func (a *atomicAddressSpace) get() *addressSpace {
//
//go:nosplit
func (c *vCPU) notify() {
- _, _, errno := syscall.RawSyscall6(
+ _, _, errno := syscall.RawSyscall6( // escapes: no.
syscall.SYS_FUTEX,
uintptr(unsafe.Pointer(&c.state)),
linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG,
diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go
index ca902c8c1..4dad877ba 100644
--- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go
+++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go
@@ -56,5 +56,9 @@ func CheckTestRegs(regs *arch.Registers, full bool) (err error) {
err = addRegisterMismatch(err, fmt.Sprintf("R%d", i), regs.Regs[i], need)
}
}
+ // Check TLS.
+ if need := ^uint64(11); regs.TPIDR_EL0 != need {
+ err = addRegisterMismatch(err, "tpidr_el0", regs.TPIDR_EL0, need)
+ }
return
}
diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s
index 0bebee852..6caf7282d 100644
--- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s
+++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s
@@ -52,6 +52,8 @@ start:
TEXT ·FloatingPointWorks(SB),NOSPLIT,$0-8
NO_LOCAL_POINTERS
+ // The GC may touch the fpsimd registers (e.g. in
+ // runtime.deductSweepCredit), so we should test them here.
FMOVD $(9.9), F0
MOVD $SYS_GETPID, R8 // getpid
SVC
@@ -102,5 +104,15 @@ isNaN:
TEXT ·TwiddleRegsSyscall(SB),NOSPLIT,$0
TWIDDLE_REGS()
+ MSR R10, TPIDR_EL0
+ // Trapped in el0_svc.
SVC
RET // never reached
+
+TEXT ·TwiddleRegsFault(SB),NOSPLIT,$0
+ TWIDDLE_REGS()
+ MSR R10, TPIDR_EL0
+ // Trapped in el0_ia.
+ // BR (Branch to Register) branches unconditionally to the address in <Rn>.
+ JMP (R6) // <=> br x6, must fault
+ RET // never reached
diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go
index 171513f3f..ba031516a 100644
--- a/pkg/sentry/platform/platform.go
+++ b/pkg/sentry/platform/platform.go
@@ -22,9 +22,10 @@ import (
"os"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/safemem"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/seccomp"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -114,6 +115,17 @@ func (NoCPUPreemptionDetection) PreemptAllCPUs() error {
panic("This platform does not support CPU preemption detection")
}
+// MemoryManager represents an abstraction above the platform address space
+// which manages memory mappings and their contents.
+type MemoryManager interface {
+ // usermem.IO provides access to the contents of a virtual memory space.
+ usermem.IO
+ // MMap establishes a memory mapping.
+ MMap(ctx context.Context, opts memmap.MMapOpts) (usermem.Addr, error)
+ // AddressSpace returns the AddressSpace bound to mm.
+ AddressSpace() AddressSpace
+}
+
// Context represents the execution context for a single thread.
type Context interface {
// Switch resumes execution of the thread specified by the arch.Context
@@ -143,7 +155,36 @@ type Context interface {
// concurrent call to Switch().
//
// - ErrContextCPUPreempted: See the definition of that error for details.
- Switch(as AddressSpace, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error)
+ Switch(ctx context.Context, mm MemoryManager, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error)
+
+ // PullFullState() pulls the full state of the application thread.
+ //
+ // A platform can support lazy loading/restoring of thread state, which
+ // includes the registers and the floating point state.
+ //
+ // For example, when the Sentry handles a system call, it may have only
+ // the syscall arguments, without the other registers or the floating
+ // point state. In that case, if the Sentry needs to construct a signal
+ // frame to call a signal handler, it must first call PullFullState()
+ // to load all registers and the FPU state.
+ //
+ // Preconditions: The caller must be running on the task goroutine.
+ PullFullState(as AddressSpace, ac arch.Context)
+
+ // FullStateChanged() indicates that the thread state has been changed
+ // by the Sentry. This happens on rt_sigreturn, execve, etc.
+ //
+ // First, it indicates that the Sentry holds the full state of the
+ // thread, so PullFullState() needs to do nothing if it is called after
+ // FullStateChanged().
+ //
+ // Second, it forces the platform to restore the full state of the
+ // application thread. Since a platform may load and restore thread
+ // state lazily, it is free to skip restoring state that the Sentry has
+ // not changed.
+ //
+ // Preconditions: The caller must be running on the task goroutine.
+ FullStateChanged()
// Interrupt interrupts a concurrent call to Switch(), causing it to return
// ErrContextInterrupt.
@@ -207,7 +248,7 @@ type AddressSpace interface {
// Preconditions: addr and fr must be page-aligned. fr.Length() > 0.
// at.Any() == true. At least one reference must be held on all pages in
// fr, and must continue to be held as long as pages are mapped.
- MapFile(addr usermem.Addr, f File, fr FileRange, at usermem.AccessType, precommit bool) error
+ MapFile(addr usermem.Addr, f memmap.File, fr memmap.FileRange, at usermem.AccessType, precommit bool) error
// Unmap unmaps the given range.
//
@@ -218,6 +259,13 @@ type AddressSpace interface {
// must be acquired via platform.NewAddressSpace().
Release()
+ // PreFork() is called before creating a copy of AddressSpace. This
+ // guarantees that this address space will be in a consistent state.
+ PreFork()
+
+ // PostFork() is called after creating a copy of AddressSpace.
+ PostFork()
+
// AddressSpaceIO methods are supported iff the associated platform's
// Platform.SupportsAddressSpaceIO() == true. AddressSpaces for which this
// does not hold may panic if AddressSpaceIO methods are invoked.
@@ -310,52 +358,6 @@ func (f SegmentationFault) Error() string {
return fmt.Sprintf("segmentation fault at %#x", f.Addr)
}
-// File represents a host file that may be mapped into an AddressSpace.
-type File interface {
- // All pages in a File are reference-counted.
-
- // IncRef increments the reference count on all pages in fr.
- //
- // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() >
- // 0. At least one reference must be held on all pages in fr. (The File
- // interface does not provide a way to acquire an initial reference;
- // implementors may define mechanisms for doing so.)
- IncRef(fr FileRange)
-
- // DecRef decrements the reference count on all pages in fr.
- //
- // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() >
- // 0. At least one reference must be held on all pages in fr.
- DecRef(fr FileRange)
-
- // MapInternal returns a mapping of the given file offsets in the invoking
- // process' address space for reading and writing.
- //
- // Note that fr.Start and fr.End need not be page-aligned.
- //
- // Preconditions: fr.Length() > 0. At least one reference must be held on
- // all pages in fr.
- //
- // Postconditions: The returned mapping is valid as long as at least one
- // reference is held on the mapped pages.
- MapInternal(fr FileRange, at usermem.AccessType) (safemem.BlockSeq, error)
-
- // FD returns the file descriptor represented by the File.
- //
- // The only permitted operation on the returned file descriptor is to map
- // pages from it consistent with the requirements of AddressSpace.MapFile.
- FD() int
-}
-
-// FileRange represents a range of uint64 offsets into a File.
-//
-// type FileRange <generated using go_generics>
-
-// String implements fmt.Stringer.String.
-func (fr FileRange) String() string {
- return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End)
-}
-
// Requirements is used to specify platform specific requirements.
type Requirements struct {
// RequiresCurrentPIDNS indicates that the sandbox has to be started in the
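Taken together, PullFullState and FullStateChanged define a lazy-state protocol between the Sentry and the platform. A hedged caller-side sketch (the function and its placement are hypothetical, not part of this patch):

// deliverSignal shows how a task might drive the protocol: the full state
// is pulled only when a signal frame must actually be built, and the
// platform is told afterwards that the state was rewritten.
func deliverSignal(ctx platform.Context, as platform.AddressSpace, ac arch.Context) {
	ctx.PullFullState(as, ac) // load all registers and FPU state
	// ... construct and push the signal frame using ac ...
	ctx.FullStateChanged() // force a full restore on the next switch
}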
diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD
index 30402c2df..e04165fbf 100644
--- a/pkg/sentry/platform/ptrace/BUILD
+++ b/pkg/sentry/platform/ptrace/BUILD
@@ -24,12 +24,14 @@ go_library(
visibility = ["//:sandbox"],
deps = [
"//pkg/abi/linux",
+ "//pkg/context",
"//pkg/log",
"//pkg/procid",
"//pkg/safecopy",
"//pkg/seccomp",
"//pkg/sentry/arch",
"//pkg/sentry/hostcpu",
+ "//pkg/sentry/memmap",
"//pkg/sentry/platform",
"//pkg/sentry/platform/interrupt",
"//pkg/sync",
diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go
index 08d055e05..b52d0fbd8 100644
--- a/pkg/sentry/platform/ptrace/ptrace.go
+++ b/pkg/sentry/platform/ptrace/ptrace.go
@@ -48,6 +48,7 @@ import (
"os"
"gvisor.dev/gvisor/pkg/abi/linux"
+ pkgcontext "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/platform/interrupt"
@@ -95,7 +96,8 @@ type context struct {
}
// Switch runs the provided context in the given address space.
-func (c *context) Switch(as platform.AddressSpace, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error) {
+func (c *context) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error) {
+ as := mm.AddressSpace()
s := as.(*subprocess)
isSyscall := s.switchToApp(c, ac)
@@ -180,6 +182,12 @@ func (c *context) Interrupt() {
// Release implements platform.Context.Release().
func (c *context) Release() {}
+// FullStateChanged implements platform.Context.FullStateChanged.
+func (c *context) FullStateChanged() {}
+
+// PullFullState implements platform.Context.PullFullState.
+func (c *context) PullFullState(as platform.AddressSpace, ac arch.Context) {}
+
// PTrace represents a collection of ptrace subprocesses.
type PTrace struct {
platform.MMapMinAddr
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index 2389423b0..e1d54d8a2 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -24,6 +24,7 @@ import (
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/procid"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/usermem"
@@ -616,7 +617,7 @@ func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintp
}
// MapFile implements platform.AddressSpace.MapFile.
-func (s *subprocess) MapFile(addr usermem.Addr, f platform.File, fr platform.FileRange, at usermem.AccessType, precommit bool) error {
+func (s *subprocess) MapFile(addr usermem.Addr, f memmap.File, fr memmap.FileRange, at usermem.AccessType, precommit bool) error {
var flags int
if precommit {
flags |= syscall.MAP_POPULATE
@@ -661,3 +662,9 @@ func (s *subprocess) Unmap(addr usermem.Addr, length uint64) {
panic(fmt.Sprintf("munmap(%x, %x)) failed: %v", addr, length, err))
}
}
+
+// PreFork implements platform.AddressSpace.PreFork.
+func (s *subprocess) PreFork() {}
+
+// PostFork implements platform.AddressSpace.PostFork.
+func (s *subprocess) PostFork() {}
diff --git a/pkg/sentry/platform/ptrace/subprocess_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_unsafe.go
index 2ae6b9f9d..0bee995e4 100644
--- a/pkg/sentry/platform/ptrace/subprocess_unsafe.go
+++ b/pkg/sentry/platform/ptrace/subprocess_unsafe.go
@@ -13,7 +13,7 @@
// limitations under the License.
// +build go1.12
-// +build !go1.15
+// +build !go1.16
// Check go:linkname function signatures when updating Go version.
diff --git a/pkg/sentry/platform/ring0/aarch64.go b/pkg/sentry/platform/ring0/aarch64.go
index 8122ac6e2..87a573cc4 100644
--- a/pkg/sentry/platform/ring0/aarch64.go
+++ b/pkg/sentry/platform/ring0/aarch64.go
@@ -26,30 +26,31 @@ const (
_PMD_PGT_SIZE = 0x4000
_PTE_PGT_BASE = 0x7000
_PTE_PGT_SIZE = 0x1000
-
- _PSR_D_BIT = 0x00000200
- _PSR_A_BIT = 0x00000100
- _PSR_I_BIT = 0x00000080
- _PSR_F_BIT = 0x00000040
)
const (
- // PSR bits
- PSR_MODE_EL0t = 0x00000000
- PSR_MODE_EL1t = 0x00000004
- PSR_MODE_EL1h = 0x00000005
- PSR_MODE_MASK = 0x0000000f
+ // DAIF bits: Debug, SError, IRQ, FIQ.
+ _PSR_D_BIT = 0x00000200
+ _PSR_A_BIT = 0x00000100
+ _PSR_I_BIT = 0x00000080
+ _PSR_F_BIT = 0x00000040
+ _PSR_DAIF_SHIFT = 6
+ _PSR_DAIF_MASK = 0xf << _PSR_DAIF_SHIFT
- // KernelFlagsSet should always be set in the kernel.
- KernelFlagsSet = PSR_MODE_EL1h
+ // PSR bits.
+ _PSR_MODE_EL0t = 0x00000000
+ _PSR_MODE_EL1t = 0x00000004
+ _PSR_MODE_EL1h = 0x00000005
+ _PSR_MODE_MASK = 0x0000000f
- // UserFlagsSet are always set in userspace.
- UserFlagsSet = PSR_MODE_EL0t
+ PsrFlagsClear = _PSR_MODE_MASK | _PSR_DAIF_MASK
+ PsrModeMask = _PSR_MODE_MASK
- KernelFlagsClear = PSR_MODE_MASK
- UserFlagsClear = PSR_MODE_MASK
+ // KernelFlagsSet should always be set in the kernel.
+ KernelFlagsSet = _PSR_MODE_EL1h | _PSR_D_BIT | _PSR_A_BIT | _PSR_I_BIT | _PSR_F_BIT
- PsrDefaultSet = _PSR_D_BIT | _PSR_A_BIT | _PSR_I_BIT | _PSR_F_BIT
+ // UserFlagsSet are always set in userspace.
+ UserFlagsSet = _PSR_MODE_EL0t
)
// Vector is an exception vector.
diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index db6465663..9d29b7168 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -40,6 +40,20 @@
#define FPEN_ENABLE (FPEN_NOTRAP << FPEN_SHIFT)
+// sctlr_el1: system control register el1.
+#define SCTLR_M 1 << 0
+#define SCTLR_C 1 << 2
+#define SCTLR_I 1 << 12
+#define SCTLR_UCT 1 << 15
+
+#define SCTLR_EL1_DEFAULT (SCTLR_M | SCTLR_C | SCTLR_I | SCTLR_UCT)
+
+// cntkctl_el1: counter-timer kernel control register el1.
+#define CNTKCTL_EL0PCTEN 1 << 0
+#define CNTKCTL_EL0VCTEN 1 << 1
+
+#define CNTKCTL_EL1_DEFAULT (CNTKCTL_EL0PCTEN | CNTKCTL_EL0VCTEN)
+
// Saves a register set.
//
// This is a macro because it may need to be executed in contexts where a stack is
@@ -298,12 +312,6 @@
ISB $15; \
DSB $15;
-#define IRQ_ENABLE \
- MSR $2, DAIFSet;
-
-#define IRQ_DISABLE \
- MSR $2, DAIFClr;
-
#define VFP_ENABLE \
MOVD $FPEN_ENABLE, R0; \
WORD $0xd5181040; \ //MSR R0, CPACR_EL1
@@ -356,15 +364,29 @@ TEXT ·Halt(SB),NOSPLIT,$0
CMP RSV_REG, R9
BNE mmio_exit
MOVD $0, CPU_REGISTERS+PTRACE_R9(RSV_REG)
+
+ // Flush dcache.
+ WORD $0xd5087e52 // DC CISW
mmio_exit:
// Disable fpsimd.
WORD $0xd5381041 // MRS CPACR_EL1, R1
MOVD R1, CPU_LAZY_VFP(RSV_REG)
VFP_DISABLE
- // MMIO_EXIT.
- MOVD $0, R9
- MOVD R0, 0xffff000000001000(R9)
+ // Trigger MMIO_EXIT/_KVM_HYPERCALL_VMEXIT.
+ //
+ // To keep it simple, the address of the exception vector table is used
+ // as the MMIO base address, so that an MMIO-EXIT can be triggered by
+ // forcibly writing to this read-only space.
+ // The region is large enough to cover the full range of hypercall IDs,
+ // so host userspace can compute the offset from this base address to
+ // find out which hypercall was issued.
+ MRS VBAR_EL1, R9
+ MOVD R0, 0x0(R9)
+
+ // Flush dcache.
+ WORD $0xd5087e52 // DC CISW
+
RET
// HaltAndResume halts execution and point the pointer to the resume function.
@@ -398,6 +420,7 @@ TEXT ·Current(SB),NOSPLIT,$0-8
// Prepare the vcpu environment for container application.
TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
// Step1, save sentry context into memory.
+ MRS TPIDR_EL1, RSV_REG
REGISTERS_SAVE(RSV_REG, CPU_REGISTERS)
MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R9(RSV_REG)
@@ -409,34 +432,13 @@ TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
MOVD CPU_REGISTERS+PTRACE_R3(RSV_REG), R3
- // Step2, save SP_EL1, PSTATE into kernel temporary stack.
- // switch to temporary stack.
+ // Step2, switch to temporary stack.
LOAD_KERNEL_STACK(RSV_REG)
- WORD $0xd538d092 //MRS TPIDR_EL1, R18
-
- SUB $STACK_FRAME_SIZE, RSP, RSP
- MOVD CPU_REGISTERS+PTRACE_SP(RSV_REG), R11
- MOVD CPU_REGISTERS+PTRACE_PSTATE(RSV_REG), R12
- STP (R11, R12), 16*0(RSP)
-
- MOVD CPU_REGISTERS+PTRACE_R11(RSV_REG), R11
- MOVD CPU_REGISTERS+PTRACE_R12(RSV_REG), R12
-
- // Step3, test user pagetable.
- // If user pagetable is empty, trapped in el1_ia.
- WORD $0xd538d092 //MRS TPIDR_EL1, R18
- SWITCH_TO_APP_PAGETABLE(RSV_REG)
- WORD $0xd538d092 //MRS TPIDR_EL1, R18
- SWITCH_TO_KVM_PAGETABLE(RSV_REG)
- WORD $0xd538d092 //MRS TPIDR_EL1, R18
-
- // If pagetable is not empty, recovery kernel temporary stack.
- ADD $STACK_FRAME_SIZE, RSP, RSP
- // Step4, load app context pointer.
+ // Step3, load app context pointer.
MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP
- // Step5, prepare the environment for container application.
+ // Step4, prepare the environment for container application.
// set sp_el0.
MOVD PTRACE_SP(RSV_REG_APP), R1
WORD $0xd5184101 //MSR R1, SP_EL0
@@ -464,13 +466,13 @@ TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
LDP 16*0(RSP), (RSV_REG, RSV_REG_APP)
ADD $STACK_FRAME_SIZE, RSP, RSP
+ ISB $15
ERET()
// kernelExitToEl1 is the entrypoint for sentry in guest_el1.
// Prepare the vcpu environment for sentry.
TEXT ·kernelExitToEl1(SB),NOSPLIT,$0
WORD $0xd538d092 //MRS TPIDR_EL1, R18
-
MOVD CPU_REGISTERS+PTRACE_PSTATE(RSV_REG), R1
WORD $0xd5184001 //MSR R1, SPSR_EL1
@@ -487,7 +489,15 @@ TEXT ·kernelExitToEl1(SB),NOSPLIT,$0
// Start is the CPU entrypoint.
TEXT ·Start(SB),NOSPLIT,$0
- IRQ_DISABLE
+ // Flush dcache.
+ WORD $0xd5087e52 // DC CISW
+ // Init.
+ MOVD $SCTLR_EL1_DEFAULT, R1
+ MSR R1, SCTLR_EL1
+
+ MOVD $CNTKCTL_EL1_DEFAULT, R1
+ MSR R1, CNTKCTL_EL1
+
MOVD R8, RSV_REG
ORR $0xffff000000000000, RSV_REG, RSV_REG
WORD $0xd518d092 //MSR R18, TPIDR_EL1
@@ -536,6 +546,7 @@ TEXT ·El1_sync(SB),NOSPLIT,$0
B el1_invalid
el1_da:
+el1_ia:
WORD $0xd538d092 //MRS TPIDR_EL1, R18
WORD $0xd538601a //MRS FAR_EL1, R26
@@ -548,9 +559,6 @@ el1_da:
B ·HaltAndResume(SB)
-el1_ia:
- B ·HaltAndResume(SB)
-
el1_sp_pc:
B ·Shutdown(SB)
@@ -622,9 +630,10 @@ el0_svc:
MOVD $Syscall, R3
MOVD R3, CPU_VECTOR_CODE(RSV_REG)
- B ·HaltAndResume(SB)
+ B ·kernelExitToEl1(SB)
el0_da:
+el0_ia:
WORD $0xd538d092 //MRS TPIDR_EL1, R18
WORD $0xd538601a //MRS FAR_EL1, R26
@@ -636,10 +645,10 @@ el0_da:
MOVD $PageFault, R3
MOVD R3, CPU_VECTOR_CODE(RSV_REG)
- B ·HaltAndResume(SB)
+ MRS ESR_EL1, R3
+ MOVD R3, CPU_ERROR_CODE(RSV_REG)
-el0_ia:
- B ·Shutdown(SB)
+ B ·kernelExitToEl1(SB)
el0_fpsimd_acc:
B ·Shutdown(SB)
@@ -654,7 +663,10 @@ el0_sp_pc:
B ·Shutdown(SB)
el0_undef:
- B ·Shutdown(SB)
+ MOVD $El0Sync_undef, R3
+ MOVD R3, CPU_VECTOR_CODE(RSV_REG)
+
+ B ·kernelExitToEl1(SB)
el0_dbg:
B ·Shutdown(SB)
diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go
index 900c0bba7..021693791 100644
--- a/pkg/sentry/platform/ring0/kernel.go
+++ b/pkg/sentry/platform/ring0/kernel.go
@@ -31,23 +31,39 @@ type defaultHooks struct{}
// KernelSyscall implements Hooks.KernelSyscall.
//
+// +checkescape:all
+//
//go:nosplit
-func (defaultHooks) KernelSyscall() { Halt() }
+func (defaultHooks) KernelSyscall() {
+ Halt()
+}
// KernelException implements Hooks.KernelException.
//
+// +checkescape:all
+//
//go:nosplit
-func (defaultHooks) KernelException(Vector) { Halt() }
+func (defaultHooks) KernelException(Vector) {
+ Halt()
+}
// kernelSyscall is a trampoline.
//
+// +checkescape:hard,stack
+//
//go:nosplit
-func kernelSyscall(c *CPU) { c.hooks.KernelSyscall() }
+func kernelSyscall(c *CPU) {
+ c.hooks.KernelSyscall()
+}
// kernelException is a trampoline.
//
+// +checkescape:hard,stack
+//
//go:nosplit
-func kernelException(c *CPU, vector Vector) { c.hooks.KernelException(vector) }
+func kernelException(c *CPU, vector Vector) {
+ c.hooks.KernelException(vector)
+}
// Init initializes a new CPU.
//
diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go
index 0feff8778..d37981dbf 100644
--- a/pkg/sentry/platform/ring0/kernel_amd64.go
+++ b/pkg/sentry/platform/ring0/kernel_amd64.go
@@ -178,6 +178,8 @@ func IsCanonical(addr uint64) bool {
//
// Precondition: the Rip, Rsp, Fs and Gs registers must be canonical.
//
+// +checkescape:all
+//
//go:nosplit
func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID)
@@ -192,9 +194,9 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
// Perform the switch.
swapgs() // GS will be swapped on return.
- WriteFS(uintptr(regs.Fs_base)) // Set application FS.
- WriteGS(uintptr(regs.Gs_base)) // Set application GS.
- LoadFloatingPoint(switchOpts.FloatingPointState) // Copy in floating point.
+ WriteFS(uintptr(regs.Fs_base)) // escapes: no. Set application FS.
+ WriteGS(uintptr(regs.Gs_base)) // escapes: no. Set application GS.
+ LoadFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy in floating point.
jumpToKernel() // Switch to upper half.
writeCR3(uintptr(userCR3)) // Change to user address space.
if switchOpts.FullRestore {
@@ -204,8 +206,8 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
}
writeCR3(uintptr(kernelCR3)) // Return to kernel address space.
jumpToUser() // Return to lower half.
- SaveFloatingPoint(switchOpts.FloatingPointState) // Copy out floating point.
- WriteFS(uintptr(c.registers.Fs_base)) // Restore kernel FS.
+ SaveFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy out floating point.
+ WriteFS(uintptr(c.registers.Fs_base)) // escapes: no. Restore kernel FS.
return
}
diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go
index ccacaea6b..d0afa1aaa 100644
--- a/pkg/sentry/platform/ring0/kernel_arm64.go
+++ b/pkg/sentry/platform/ring0/kernel_arm64.go
@@ -53,14 +53,20 @@ func IsCanonical(addr uint64) bool {
//go:nosplit
func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
- // Sanitize registers.
regs := switchOpts.Registers
- regs.Pstate &= ^uint64(UserFlagsClear)
+ regs.Pstate &= ^uint64(PsrFlagsClear)
regs.Pstate |= UserFlagsSet
+
+ LoadFloatingPoint(switchOpts.FloatingPointState)
+ SetTLS(regs.TPIDR_EL0)
+
kernelExitToEl0()
+
+ regs.TPIDR_EL0 = GetTLS()
+ SaveFloatingPoint(switchOpts.FloatingPointState)
+
vector = c.vecCode
- // Perform the switch.
return
}
diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go
index a6345010d..00e52c8af 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.go
+++ b/pkg/sentry/platform/ring0/lib_arm64.go
@@ -38,6 +38,12 @@ func SaveVRegs(*byte)
// LoadVRegs loads V0-V31 registers.
func LoadVRegs(*byte)
+// LoadFloatingPoint loads floating point state.
+func LoadFloatingPoint(*byte)
+
+// SaveFloatingPoint saves floating point state.
+func SaveFloatingPoint(*byte)
+
// GetTLS returns the value of TPIDR_EL0 register.
func GetTLS() (value uint64)
diff --git a/pkg/sentry/platform/ring0/lib_arm64.s b/pkg/sentry/platform/ring0/lib_arm64.s
index b63e14b41..86bfbe46f 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.s
+++ b/pkg/sentry/platform/ring0/lib_arm64.s
@@ -129,3 +129,89 @@ TEXT ·LoadVRegs(SB),NOSPLIT,$0-8
ISB $15
RET
+
+TEXT ·LoadFloatingPoint(SB),NOSPLIT,$0-8
+ MOVD addr+0(FP), R0
+
+ MOVD 0(R0), R1
+ MOVD R1, FPSR
+ MOVD 8(R0), R1
+ MOVD R1, NZCV
+
+ FMOVD 16*1(R0), F0
+ FMOVD 16*2(R0), F1
+ FMOVD 16*3(R0), F2
+ FMOVD 16*4(R0), F3
+ FMOVD 16*5(R0), F4
+ FMOVD 16*6(R0), F5
+ FMOVD 16*7(R0), F6
+ FMOVD 16*8(R0), F7
+ FMOVD 16*9(R0), F8
+ FMOVD 16*10(R0), F9
+ FMOVD 16*11(R0), F10
+ FMOVD 16*12(R0), F11
+ FMOVD 16*13(R0), F12
+ FMOVD 16*14(R0), F13
+ FMOVD 16*15(R0), F14
+ FMOVD 16*16(R0), F15
+ FMOVD 16*17(R0), F16
+ FMOVD 16*18(R0), F17
+ FMOVD 16*19(R0), F18
+ FMOVD 16*20(R0), F19
+ FMOVD 16*21(R0), F20
+ FMOVD 16*22(R0), F21
+ FMOVD 16*23(R0), F22
+ FMOVD 16*24(R0), F23
+ FMOVD 16*25(R0), F24
+ FMOVD 16*26(R0), F25
+ FMOVD 16*27(R0), F26
+ FMOVD 16*28(R0), F27
+ FMOVD 16*29(R0), F28
+ FMOVD 16*30(R0), F29
+ FMOVD 16*31(R0), F30
+ FMOVD 16*32(R0), F31
+
+ RET
+
+TEXT ·SaveFloatingPoint(SB),NOSPLIT,$0-8
+ MOVD addr+0(FP), R0
+
+ MOVD FPSR, R1
+ MOVD R1, 0(R0)
+ MOVD NZCV, R1
+ MOVD R1, 8(R0)
+
+ FMOVD F0, 16*1(R0)
+ FMOVD F1, 16*2(R0)
+ FMOVD F2, 16*3(R0)
+ FMOVD F3, 16*4(R0)
+ FMOVD F4, 16*5(R0)
+ FMOVD F5, 16*6(R0)
+ FMOVD F6, 16*7(R0)
+ FMOVD F7, 16*8(R0)
+ FMOVD F8, 16*9(R0)
+ FMOVD F9, 16*10(R0)
+ FMOVD F10, 16*11(R0)
+ FMOVD F11, 16*12(R0)
+ FMOVD F12, 16*13(R0)
+ FMOVD F13, 16*14(R0)
+ FMOVD F14, 16*15(R0)
+ FMOVD F15, 16*16(R0)
+ FMOVD F16, 16*17(R0)
+ FMOVD F17, 16*18(R0)
+ FMOVD F18, 16*19(R0)
+ FMOVD F19, 16*20(R0)
+ FMOVD F20, 16*21(R0)
+ FMOVD F21, 16*22(R0)
+ FMOVD F22, 16*23(R0)
+ FMOVD F23, 16*24(R0)
+ FMOVD F24, 16*25(R0)
+ FMOVD F25, 16*26(R0)
+ FMOVD F26, 16*27(R0)
+ FMOVD F27, 16*28(R0)
+ FMOVD F28, 16*29(R0)
+ FMOVD F29, 16*30(R0)
+ FMOVD F30, 16*31(R0)
+ FMOVD F31, 16*32(R0)
+
+ RET
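The offsets above imply a floating point state layout with FPSR at byte 0, NZCV at byte 8, and the 32 vector registers in 16-byte slots starting at byte 16 (FMOVD moves only the low 64 bits of each slot here). A Go sketch of that layout, with a hypothetical type name:

// fpState mirrors the layout assumed by LoadFloatingPoint/SaveFloatingPoint
// (illustrative; the real state is passed around as a raw *byte buffer).
type fpState struct {
	FPSR uint64        // byte offset 0
	NZCV uint64        // byte offset 8
	V    [32][2]uint64 // 16-byte slots from offset 16; low half used here
}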
diff --git a/pkg/sentry/platform/ring0/pagetables/allocator.go b/pkg/sentry/platform/ring0/pagetables/allocator.go
index 23fd5c352..8d75b7599 100644
--- a/pkg/sentry/platform/ring0/pagetables/allocator.go
+++ b/pkg/sentry/platform/ring0/pagetables/allocator.go
@@ -53,9 +53,14 @@ type RuntimeAllocator struct {
// NewRuntimeAllocator returns an allocator that uses runtime allocation.
func NewRuntimeAllocator() *RuntimeAllocator {
- return &RuntimeAllocator{
- used: make(map[*PTEs]struct{}),
- }
+ r := new(RuntimeAllocator)
+ r.Init()
+ return r
+}
+
+// Init initializes a RuntimeAllocator.
+func (r *RuntimeAllocator) Init() {
+ r.used = make(map[*PTEs]struct{})
}
// Recycle returns freed pages to the pool.
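Splitting Init out of NewRuntimeAllocator lets callers embed a RuntimeAllocator in a larger structure and initialize it in place, avoiding a separate heap allocation. A hypothetical embedding:

// machineAllocator embeds the allocator directly (sketch; the type name is
// made up for illustration).
type machineAllocator struct {
	pagetables.RuntimeAllocator
	// ... other per-machine state ...
}

func newMachineAllocator() *machineAllocator {
	a := new(machineAllocator)
	a.Init() // initialize the embedded allocator in place
	return a
}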
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go
index 87e88e97d..7f18ac296 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go
@@ -86,6 +86,8 @@ func (*mapVisitor) requiresSplit() bool { return true }
//
// Precondition: addr & length must be page-aligned, their sum must not overflow.
//
+// +checkescape:hard,stack
+//
//go:nosplit
func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physical uintptr) bool {
if !opts.AccessType.Any() {
@@ -128,6 +130,8 @@ func (v *unmapVisitor) visit(start uintptr, pte *PTE, align uintptr) {
//
// Precondition: addr & length must be page-aligned.
//
+// +checkescape:hard,stack
+//
//go:nosplit
func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool {
w := unmapWalker{
@@ -162,6 +166,8 @@ func (v *emptyVisitor) visit(start uintptr, pte *PTE, align uintptr) {
//
// Precondition: addr & length must be page-aligned.
//
+// +checkescape:hard,stack
+//
//go:nosplit
func (p *PageTables) IsEmpty(addr usermem.Addr, length uintptr) bool {
w := emptyWalker{
@@ -197,6 +203,8 @@ func (*lookupVisitor) requiresSplit() bool { return false }
// Lookup returns the physical address for the given virtual address.
//
+// +checkescape:hard,stack
+//
//go:nosplit
func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, opts MapOpts) {
mask := uintptr(usermem.PageSize - 1)
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
index 78510ebed..6409d1d91 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
@@ -72,13 +72,14 @@ const (
)
const (
- mtNormal = 0x4 << 2
+ mtDevicenGnRE = 0x1 << 2
+ mtNormal = 0x4 << 2
)
const (
executeDisable = xn
optionMask = 0xfff | 0xfff<<48
- protDefault = accessed | shared | mtNormal
+ protDefault = accessed | shared
)
// MapOpts are x86 options.
@@ -184,8 +185,10 @@ func (p *PTE) Set(addr uintptr, opts MapOpts) {
if opts.User {
v |= user
+ v |= mtNormal
} else {
v = v &^ user
+ v |= mtDevicenGnRE // Strong ordering for addresses with ring0.KernelStartAddress.
}
atomic.StoreUintptr((*uintptr)(p), v)
}
@@ -200,7 +203,7 @@ func (p *PTE) setPageTable(pt *PageTables, ptes *PTEs) {
// This should never happen.
panic("unaligned physical address!")
}
- v := addr | typeTable | protDefault
+ v := addr | typeTable | protDefault | mtNormal
atomic.StoreUintptr((*uintptr)(p), v)
}
diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD
index c40c6d673..c0fd3425b 100644
--- a/pkg/sentry/socket/BUILD
+++ b/pkg/sentry/socket/BUILD
@@ -20,5 +20,6 @@ go_library(
"//pkg/syserr",
"//pkg/tcpip",
"//pkg/usermem",
+ "//tools/go_marshal/marshal",
],
)
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 8b439a078..70ccf77a7 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -68,7 +68,7 @@ func NewSCMRights(t *kernel.Task, fds []int32) (SCMRights, error) {
for _, fd := range fds {
file := t.GetFile(fd)
if file == nil {
- files.Release()
+ files.Release(t)
return nil, syserror.EBADF
}
files = append(files, file)
@@ -100,9 +100,9 @@ func (fs *RightsFiles) Clone() transport.RightsControlMessage {
}
// Release implements transport.RightsControlMessage.Release.
-func (fs *RightsFiles) Release() {
+func (fs *RightsFiles) Release(ctx context.Context) {
for _, f := range *fs {
- f.DecRef()
+ f.DecRef(ctx)
}
*fs = nil
}
@@ -115,7 +115,7 @@ func rightsFDs(t *kernel.Task, rights SCMRights, cloexec bool, max int) ([]int32
fd, err := t.NewFDFrom(0, files[0], kernel.FDFlags{
CloseOnExec: cloexec,
})
- files[0].DecRef()
+ files[0].DecRef(t)
files = files[1:]
if err != nil {
t.Warningf("Error inserting FD: %v", err)
diff --git a/pkg/sentry/socket/control/control_vfs2.go b/pkg/sentry/socket/control/control_vfs2.go
index fd08179be..d9621968c 100644
--- a/pkg/sentry/socket/control/control_vfs2.go
+++ b/pkg/sentry/socket/control/control_vfs2.go
@@ -46,7 +46,7 @@ func NewSCMRightsVFS2(t *kernel.Task, fds []int32) (SCMRightsVFS2, error) {
for _, fd := range fds {
file := t.GetFileVFS2(fd)
if file == nil {
- files.Release()
+ files.Release(t)
return nil, syserror.EBADF
}
files = append(files, file)
@@ -78,9 +78,9 @@ func (fs *RightsFilesVFS2) Clone() transport.RightsControlMessage {
}
// Release implements transport.RightsControlMessage.Release.
-func (fs *RightsFilesVFS2) Release() {
+func (fs *RightsFilesVFS2) Release(ctx context.Context) {
for _, f := range *fs {
- f.DecRef()
+ f.DecRef(ctx)
}
*fs = nil
}
@@ -93,7 +93,7 @@ func rightsFDsVFS2(t *kernel.Task, rights SCMRightsVFS2, cloexec bool, max int)
fd, err := t.NewFDFromVFS2(0, files[0], kernel.FDFlags{
CloseOnExec: cloexec,
})
- files[0].DecRef()
+ files[0].DecRef(t)
files = files[1:]
if err != nil {
t.Warningf("Error inserting FD: %v", err)
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index e82d6cd1e..e76e498de 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -26,6 +26,7 @@ go_library(
"//pkg/sentry/device",
"//pkg/sentry/fs",
"//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/fsimpl/sockfs",
"//pkg/sentry/hostfd",
"//pkg/sentry/inet",
@@ -39,6 +40,8 @@ go_library(
"//pkg/tcpip/stack",
"//pkg/usermem",
"//pkg/waiter",
+ "//tools/go_marshal/marshal",
+ "//tools/go_marshal/primitive",
"@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index c11e82c10..242e6bf76 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -36,6 +36,8 @@ import (
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
+ "gvisor.dev/gvisor/tools/go_marshal/marshal"
+ "gvisor.dev/gvisor/tools/go_marshal/primitive"
)
const (
@@ -98,12 +100,12 @@ func newSocketFile(ctx context.Context, family int, stype linux.SockType, protoc
return nil, syserr.FromError(err)
}
dirent := socket.NewDirent(ctx, socketDevice)
- defer dirent.DecRef()
+ defer dirent.DecRef(ctx)
return fs.NewFile(ctx, dirent, fs.FileFlags{NonBlocking: nonblock, Read: true, Write: true, NonSeekable: true}, s), nil
}
// Release implements fs.FileOperations.Release.
-func (s *socketOpsCommon) Release() {
+func (s *socketOpsCommon) Release(context.Context) {
fdnotifier.RemoveFD(int32(s.fd))
syscall.Close(s.fd)
}
@@ -267,7 +269,7 @@ func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int,
syscall.Close(fd)
return 0, nil, 0, err
}
- defer f.DecRef()
+ defer f.DecRef(t)
kfd, kerr = t.NewFDFromVFS2(0, f, kernel.FDFlags{
CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0,
@@ -279,7 +281,7 @@ func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int,
syscall.Close(fd)
return 0, nil, 0, err
}
- defer f.DecRef()
+ defer f.DecRef(t)
kfd, kerr = t.NewFDFrom(0, f, kernel.FDFlags{
CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0,
@@ -319,12 +321,12 @@ func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error {
}
// GetSockOpt implements socket.Socket.GetSockOpt.
-func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
+func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
if outLen < 0 {
return nil, syserr.ErrInvalidArgument
}
- // Whitelist options and constrain option length.
+ // Only allow known and safe options.
optlen := getSockOptLen(t, level, name)
switch level {
case linux.SOL_IP:
@@ -364,12 +366,13 @@ func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr
if err != nil {
return nil, syserr.FromError(err)
}
- return opt, nil
+ optP := primitive.ByteSlice(opt)
+ return &optP, nil
}
// SetSockOpt implements socket.Socket.SetSockOpt.
func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error {
- // Whitelist options and constrain option length.
+ // Only allow known and safe options.
optlen := setSockOptLen(t, level, name)
switch level {
case linux.SOL_IP:
@@ -415,7 +418,7 @@ func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt []
// RecvMsg implements socket.Socket.RecvMsg.
func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
- // Whitelist flags.
+ // Only allow known and safe flags.
//
// FIXME(jamieliu): We can't support MSG_ERRQUEUE because it uses ancillary
// messages that gvisor/pkg/tcpip/transport/unix doesn't understand. Kill the
@@ -537,7 +540,7 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
// SendMsg implements socket.Socket.SendMsg.
func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
- // Whitelist flags.
+ // Only allow known and safe flags.
if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 {
return 0, syserr.ErrInvalidArgument
}
@@ -708,6 +711,6 @@ func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int
func init() {
for _, family := range []int{syscall.AF_INET, syscall.AF_INET6} {
socket.RegisterProvider(family, &socketProvider{family})
- socket.RegisterProviderVFS2(family, &socketProviderVFS2{})
+ socket.RegisterProviderVFS2(family, &socketProviderVFS2{family})
}
}
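Since GetSockOpt now returns a marshal.Marshallable, raw option bytes from the host getsockopt must be wrapped in a marshallable type before being returned, as done above. A minimal sketch of that wrapping step in isolation (the helper name is hypothetical):

// wrapOpt wraps raw socket option bytes so they satisfy
// marshal.Marshallable.
func wrapOpt(opt []byte) marshal.Marshallable {
	optP := primitive.ByteSlice(opt)
	return &optP
}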
diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go
index 677743113..8a1d52ebf 100644
--- a/pkg/sentry/socket/hostinet/socket_vfs2.go
+++ b/pkg/sentry/socket/hostinet/socket_vfs2.go
@@ -21,6 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fdnotifier"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
"gvisor.dev/gvisor/pkg/sentry/hostfd"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -35,6 +36,7 @@ import (
type socketVFS2 struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
+ vfs.LockFD
// We store metadata for hostinet sockets internally. Technically, we should
// access metadata (e.g. through stat, chmod) on the host for correctness,
@@ -59,6 +61,7 @@ func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol in
fd: fd,
},
}
+ s.LockFD.Init(&vfs.FileLocks{})
if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil {
return nil, syserr.FromError(err)
}
@@ -68,6 +71,7 @@ func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol in
DenyPWrite: true,
UseDentryMetadata: true,
}); err != nil {
+ fdnotifier.RemoveFD(int32(s.fd))
return nil, syserr.FromError(err)
}
return vfsfd, nil
@@ -93,7 +97,12 @@ func (s *socketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal
return ioctl(ctx, s.fd, uio, args)
}
-// PRead implements vfs.FileDescriptionImpl.
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (s *socketVFS2) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ return syserror.ENODEV
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
func (s *socketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
return 0, syserror.ESPIPE
}
@@ -131,6 +140,16 @@ func (s *socketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs
return int64(n), err
}
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (s *socketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (s *socketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence)
+}
+
type socketProviderVFS2 struct {
family int
}
diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go
index a48082631..fda3dcb35 100644
--- a/pkg/sentry/socket/hostinet/stack.go
+++ b/pkg/sentry/socket/hostinet/stack.go
@@ -53,6 +53,7 @@ type Stack struct {
interfaceAddrs map[int32][]inet.InterfaceAddr
routes []inet.Route
supportsIPv6 bool
+ tcpRecovery inet.TCPLossRecovery
tcpRecvBufSize inet.TCPBufferSize
tcpSendBufSize inet.TCPBufferSize
tcpSACKEnabled bool
@@ -350,6 +351,16 @@ func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
return syserror.EACCES
}
+// TCPRecovery implements inet.Stack.TCPRecovery.
+func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) {
+ return s.tcpRecovery, nil
+}
+
+// SetTCPRecovery implements inet.Stack.SetTCPRecovery.
+func (s *Stack) SetTCPRecovery(recovery inet.TCPLossRecovery) error {
+ return syserror.EACCES
+}
+
// getLine reads one line from a proc file with the specified prefix.
// The last argument, withHeader, specifies whether the line contains a header.
func getLine(f *os.File, prefix string, withHeader bool) string {
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 789bb94c8..e91b0624c 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -32,28 +32,6 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
-// errorTargetName is used to mark targets as error targets. Error targets
-// shouldn't be reached - an error has occurred if we fall through to one.
-const errorTargetName = "ERROR"
-
-// redirectTargetName is used to mark targets as redirect targets. Redirect
-// targets should be reached for only NAT and Mangle tables. These targets will
-// change the destination port/destination IP for packets.
-const redirectTargetName = "REDIRECT"
-
-// Metadata is used to verify that we are correctly serializing and
-// deserializing iptables into structs consumable by the iptables tool. We save
-// a metadata struct when the tables are written, and when they are read out we
-// verify that certain fields are the same.
-//
-// metadata is used by this serialization/deserializing code, not netstack.
-type metadata struct {
- HookEntry [linux.NF_INET_NUMHOOKS]uint32
- Underflow [linux.NF_INET_NUMHOOKS]uint32
- NumEntries uint32
- Size uint32
-}
-
// enableLogging controls whether to log the (de)serialization of netfilter
// structs between userspace and netstack. These logs are useful when
// developing iptables, but can pollute sentry logs otherwise.
@@ -64,6 +42,8 @@ const enableLogging = false
var emptyFilter = stack.IPHeaderFilter{
Dst: "\x00\x00\x00\x00",
DstMask: "\x00\x00\x00\x00",
+ Src: "\x00\x00\x00\x00",
+ SrcMask: "\x00\x00\x00\x00",
}
// nflog logs messages related to the writing and reading of iptables.
@@ -77,33 +57,17 @@ func nflog(format string, args ...interface{}) {
func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPTGetinfo, *syserr.Error) {
// Read in the struct and table name.
var info linux.IPTGetinfo
- if _, err := t.CopyIn(outPtr, &info); err != nil {
+ if _, err := info.CopyIn(t, outPtr); err != nil {
return linux.IPTGetinfo{}, syserr.FromError(err)
}
- // Find the appropriate table.
- table, err := findTable(stack, info.Name)
+ _, info, err := convertNetstackToBinary(stack, info.Name)
if err != nil {
- nflog("%v", err)
+ nflog("couldn't convert iptables: %v", err)
return linux.IPTGetinfo{}, syserr.ErrInvalidArgument
}
- // Get the hooks that apply to this table.
- info.ValidHooks = table.ValidHooks()
-
- // Grab the metadata struct, which is used to store information (e.g.
- // the number of entries) that applies to the user's encoding of
- // iptables, but not netstack's.
- metadata := table.Metadata().(metadata)
-
- // Set values from metadata.
- info.HookEntry = metadata.HookEntry
- info.Underflow = metadata.Underflow
- info.NumEntries = metadata.NumEntries
- info.Size = metadata.Size
-
nflog("returning info: %+v", info)
-
return info, nil
}
@@ -111,28 +75,18 @@ func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPT
func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen int) (linux.KernelIPTGetEntries, *syserr.Error) {
// Read in the struct and table name.
var userEntries linux.IPTGetEntries
- if _, err := t.CopyIn(outPtr, &userEntries); err != nil {
+ if _, err := userEntries.CopyIn(t, outPtr); err != nil {
nflog("couldn't copy in entries %q", userEntries.Name)
return linux.KernelIPTGetEntries{}, syserr.FromError(err)
}
- // Find the appropriate table.
- table, err := findTable(stack, userEntries.Name)
- if err != nil {
- nflog("%v", err)
- return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
- }
-
// Convert netstack's iptables rules to something that the iptables
// tool can understand.
- entries, meta, err := convertNetstackToBinary(userEntries.Name.String(), table)
+ entries, _, err := convertNetstackToBinary(stack, userEntries.Name)
if err != nil {
nflog("couldn't read entries: %v", err)
return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
}
- if meta != table.Metadata().(metadata) {
- panic(fmt.Sprintf("Table %q metadata changed between writing and reading. Was saved as %+v, but is now %+v", userEntries.Name.String(), table.Metadata().(metadata), meta))
- }
if binary.Size(entries) > uintptr(outLen) {
nflog("insufficient GetEntries output size: %d", uintptr(outLen))
return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
@@ -141,48 +95,26 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen
return entries, nil
}
-func findTable(stk *stack.Stack, tablename linux.TableName) (stack.Table, error) {
- ipt := stk.IPTables()
- table, ok := ipt.Tables[tablename.String()]
- if !ok {
- return stack.Table{}, fmt.Errorf("couldn't find table %q", tablename)
- }
- return table, nil
-}
-
-// FillDefaultIPTables sets stack's IPTables to the default tables and
-// populates them with metadata.
-func FillDefaultIPTables(stk *stack.Stack) {
- ipt := stack.DefaultTables()
-
- // In order to fill in the metadata, we have to translate ipt from its
- // netstack format to Linux's giant-binary-blob format.
- for name, table := range ipt.Tables {
- _, metadata, err := convertNetstackToBinary(name, table)
- if err != nil {
- panic(fmt.Errorf("Unable to set default IP tables: %v", err))
- }
- table.SetMetadata(metadata)
- ipt.Tables[name] = table
- }
-
- stk.SetIPTables(ipt)
-}
-
// convertNetstackToBinary converts the iptables as stored in netstack to the
// format expected by the iptables tool. Linux stores each table as a binary
// blob that can only be traversed by parsing a bit, reading some offsets,
// jumping to those offsets, parsing again, etc.
-func convertNetstackToBinary(tablename string, table stack.Table) (linux.KernelIPTGetEntries, metadata, error) {
- // Return values.
+func convertNetstackToBinary(stack *stack.Stack, tablename linux.TableName) (linux.KernelIPTGetEntries, linux.IPTGetinfo, error) {
+ table, ok := stack.IPTables().GetTable(tablename.String())
+ if !ok {
+ return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("couldn't find table %q", tablename)
+ }
+
var entries linux.KernelIPTGetEntries
- var meta metadata
+ var info linux.IPTGetinfo
+ info.ValidHooks = table.ValidHooks()
// The table name has to fit in the struct.
if linux.XT_TABLE_MAXNAMELEN < len(tablename) {
- return linux.KernelIPTGetEntries{}, metadata{}, fmt.Errorf("table name %q too long.", tablename)
+ return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("table name %q too long", tablename)
}
- copy(entries.Name[:], tablename)
+ copy(info.Name[:], tablename[:])
+ copy(entries.Name[:], tablename[:])
for ruleIdx, rule := range table.Rules {
nflog("convert to binary: current offset: %d", entries.Size)
@@ -191,20 +123,20 @@ func convertNetstackToBinary(tablename string, table stack.Table) (linux.KernelI
for hook, hookRuleIdx := range table.BuiltinChains {
if hookRuleIdx == ruleIdx {
nflog("convert to binary: found hook %d at offset %d", hook, entries.Size)
- meta.HookEntry[hook] = entries.Size
+ info.HookEntry[hook] = entries.Size
}
}
// Is this a chain underflow point?
for underflow, underflowRuleIdx := range table.Underflows {
if underflowRuleIdx == ruleIdx {
nflog("convert to binary: found underflow %d at offset %d", underflow, entries.Size)
- meta.Underflow[underflow] = entries.Size
+ info.Underflow[underflow] = entries.Size
}
}
// Each rule corresponds to an entry.
entry := linux.KernelIPTEntry{
- IPTEntry: linux.IPTEntry{
+ Entry: linux.IPTEntry{
IP: linux.IPTIP{
Protocol: uint16(rule.Filter.Protocol),
},
@@ -212,15 +144,20 @@ func convertNetstackToBinary(tablename string, table stack.Table) (linux.KernelI
TargetOffset: linux.SizeOfIPTEntry,
},
}
- copy(entry.IPTEntry.IP.Dst[:], rule.Filter.Dst)
- copy(entry.IPTEntry.IP.DstMask[:], rule.Filter.DstMask)
- copy(entry.IPTEntry.IP.OutputInterface[:], rule.Filter.OutputInterface)
- copy(entry.IPTEntry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask)
+ copy(entry.Entry.IP.Dst[:], rule.Filter.Dst)
+ copy(entry.Entry.IP.DstMask[:], rule.Filter.DstMask)
+ copy(entry.Entry.IP.Src[:], rule.Filter.Src)
+ copy(entry.Entry.IP.SrcMask[:], rule.Filter.SrcMask)
+ copy(entry.Entry.IP.OutputInterface[:], rule.Filter.OutputInterface)
+ copy(entry.Entry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask)
if rule.Filter.DstInvert {
- entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_DSTIP
+ entry.Entry.IP.InverseFlags |= linux.IPT_INV_DSTIP
+ }
+ if rule.Filter.SrcInvert {
+ entry.Entry.IP.InverseFlags |= linux.IPT_INV_SRCIP
}
if rule.Filter.OutputInterfaceInvert {
- entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_VIA_OUT
+ entry.Entry.IP.InverseFlags |= linux.IPT_INV_VIA_OUT
}
for _, matcher := range rule.Matchers {
@@ -232,8 +169,8 @@ func convertNetstackToBinary(tablename string, table stack.Table) (linux.KernelI
panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher))
}
entry.Elems = append(entry.Elems, serialized...)
- entry.NextOffset += uint16(len(serialized))
- entry.TargetOffset += uint16(len(serialized))
+ entry.Entry.NextOffset += uint16(len(serialized))
+ entry.Entry.TargetOffset += uint16(len(serialized))
}
// Serialize and append the target.
@@ -242,142 +179,18 @@ func convertNetstackToBinary(tablename string, table stack.Table) (linux.KernelI
panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target))
}
entry.Elems = append(entry.Elems, serialized...)
- entry.NextOffset += uint16(len(serialized))
+ entry.Entry.NextOffset += uint16(len(serialized))
nflog("convert to binary: adding entry: %+v", entry)
- entries.Size += uint32(entry.NextOffset)
+ entries.Size += uint32(entry.Entry.NextOffset)
entries.Entrytable = append(entries.Entrytable, entry)
- meta.NumEntries++
- }
-
- nflog("convert to binary: finished with an marshalled size of %d", meta.Size)
- meta.Size = entries.Size
- return entries, meta, nil
-}
-
-func marshalTarget(target stack.Target) []byte {
- switch tg := target.(type) {
- case stack.AcceptTarget:
- return marshalStandardTarget(stack.RuleAccept)
- case stack.DropTarget:
- return marshalStandardTarget(stack.RuleDrop)
- case stack.ErrorTarget:
- return marshalErrorTarget(errorTargetName)
- case stack.UserChainTarget:
- return marshalErrorTarget(tg.Name)
- case stack.ReturnTarget:
- return marshalStandardTarget(stack.RuleReturn)
- case stack.RedirectTarget:
- return marshalRedirectTarget(tg)
- case JumpTarget:
- return marshalJumpTarget(tg)
- default:
- panic(fmt.Errorf("unknown target of type %T", target))
- }
-}
-
-func marshalStandardTarget(verdict stack.RuleVerdict) []byte {
- nflog("convert to binary: marshalling standard target")
-
- // The target's name will be the empty string.
- target := linux.XTStandardTarget{
- Target: linux.XTEntryTarget{
- TargetSize: linux.SizeOfXTStandardTarget,
- },
- Verdict: translateFromStandardVerdict(verdict),
+ info.NumEntries++
}
- ret := make([]byte, 0, linux.SizeOfXTStandardTarget)
- return binary.Marshal(ret, usermem.ByteOrder, target)
-}
-
-func marshalErrorTarget(errorName string) []byte {
- // This is an error target named error
- target := linux.XTErrorTarget{
- Target: linux.XTEntryTarget{
- TargetSize: linux.SizeOfXTErrorTarget,
- },
- }
- copy(target.Name[:], errorName)
- copy(target.Target.Name[:], errorTargetName)
-
- ret := make([]byte, 0, linux.SizeOfXTErrorTarget)
- return binary.Marshal(ret, usermem.ByteOrder, target)
-}
-
-func marshalRedirectTarget(rt stack.RedirectTarget) []byte {
- // This is a redirect target named redirect
- target := linux.XTRedirectTarget{
- Target: linux.XTEntryTarget{
- TargetSize: linux.SizeOfXTRedirectTarget,
- },
- }
- copy(target.Target.Name[:], redirectTargetName)
-
- ret := make([]byte, 0, linux.SizeOfXTRedirectTarget)
- target.NfRange.RangeSize = 1
- if rt.RangeProtoSpecified {
- target.NfRange.RangeIPV4.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED
- }
- // Convert port from little endian to big endian.
- port := make([]byte, 2)
- binary.LittleEndian.PutUint16(port, rt.MinPort)
- target.NfRange.RangeIPV4.MinPort = binary.BigEndian.Uint16(port)
- binary.LittleEndian.PutUint16(port, rt.MaxPort)
- target.NfRange.RangeIPV4.MaxPort = binary.BigEndian.Uint16(port)
- return binary.Marshal(ret, usermem.ByteOrder, target)
-}
-
-func marshalJumpTarget(jt JumpTarget) []byte {
- nflog("convert to binary: marshalling jump target")
-
- // The target's name will be the empty string.
- target := linux.XTStandardTarget{
- Target: linux.XTEntryTarget{
- TargetSize: linux.SizeOfXTStandardTarget,
- },
- // Verdict is overloaded by the ABI. When positive, it holds
- // the jump offset from the start of the table.
- Verdict: int32(jt.Offset),
- }
-
- ret := make([]byte, 0, linux.SizeOfXTStandardTarget)
- return binary.Marshal(ret, usermem.ByteOrder, target)
-}
-
-// translateFromStandardVerdict translates verdicts the same way as the iptables
-// tool.
-func translateFromStandardVerdict(verdict stack.RuleVerdict) int32 {
- switch verdict {
- case stack.RuleAccept:
- return -linux.NF_ACCEPT - 1
- case stack.RuleDrop:
- return -linux.NF_DROP - 1
- case stack.RuleReturn:
- return linux.NF_RETURN
- default:
- // TODO(gvisor.dev/issue/170): Support Jump.
- panic(fmt.Sprintf("unknown standard verdict: %d", verdict))
- }
-}
-
-// translateToStandardTarget translates from the value in a
-// linux.XTStandardTarget to an stack.Verdict.
-func translateToStandardTarget(val int32) (stack.Target, error) {
- // TODO(gvisor.dev/issue/170): Support other verdicts.
- switch val {
- case -linux.NF_ACCEPT - 1:
- return stack.AcceptTarget{}, nil
- case -linux.NF_DROP - 1:
- return stack.DropTarget{}, nil
- case -linux.NF_QUEUE - 1:
- return nil, errors.New("unsupported iptables verdict QUEUE")
- case linux.NF_RETURN:
- return stack.ReturnTarget{}, nil
- default:
- return nil, fmt.Errorf("unknown iptables verdict %d", val)
- }
+ nflog("convert to binary: finished with an marshalled size of %d", info.Size)
+ info.Size = entries.Size
+ return entries, info, nil
}
// SetEntries sets iptables rules for a single table. See
@@ -396,10 +209,10 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
// TODO(gvisor.dev/issue/170): Support other tables.
var table stack.Table
switch replace.Name.String() {
- case stack.TablenameFilter:
+ case stack.FilterTable:
table = stack.EmptyFilterTable()
- case stack.TablenameNat:
- table = stack.EmptyNatTable()
+ case stack.NATTable:
+ table = stack.EmptyNATTable()
default:
nflog("we don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String())
return syserr.ErrInvalidArgument
@@ -485,6 +298,8 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
for hook, _ := range replace.HookEntry {
if table.ValidHooks()&(1<<hook) != 0 {
hk := hookFromLinux(hook)
+ table.BuiltinChains[hk] = stack.HookUnset
+ table.Underflows[hk] = stack.HookUnset
for offset, ruleIdx := range offsets {
if offset == replace.HookEntry[hook] {
table.BuiltinChains[hk] = ruleIdx
@@ -510,8 +325,7 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
// Add the user chains.
for ruleIdx, rule := range table.Rules {
- target, ok := rule.Target.(stack.UserChainTarget)
- if !ok {
+ if _, ok := rule.Target.(stack.UserChainTarget); !ok {
continue
}
@@ -527,7 +341,6 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
nflog("user chain's first node must have no matchers")
return syserr.ErrInvalidArgument
}
- table.UserChains[target.Name] = ruleIdx + 1
}
// Set each jump to point to the appropriate rule. Right now they hold byte
@@ -553,7 +366,10 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
// Since we only support modifying the INPUT, PREROUTING and OUTPUT chain right now,
// make sure all other chains point to ACCEPT rules.
for hook, ruleIdx := range table.BuiltinChains {
- if hook == stack.Forward || hook == stack.Postrouting {
+ if hook := stack.Hook(hook); hook == stack.Forward || hook == stack.Postrouting {
+ if ruleIdx == stack.HookUnset {
+ continue
+ }
if !isUnconditionalAccept(table.Rules[ruleIdx]) {
nflog("hook %d is unsupported.", hook)
return syserr.ErrInvalidArgument
@@ -566,17 +382,7 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
// - There are no chains without an unconditional final rule.
// - There are no chains without an unconditional underflow rule.
- ipt := stk.IPTables()
- table.SetMetadata(metadata{
- HookEntry: replace.HookEntry,
- Underflow: replace.Underflow,
- NumEntries: replace.NumEntries,
- Size: replace.Size,
- })
- ipt.Tables[replace.Name.String()] = table
- stk.SetIPTables(ipt)
-
- return nil
+ return syserr.TranslateNetstackError(stk.IPTables().ReplaceTable(replace.Name.String(), table))
}
// parseMatchers parses 0 or more matchers from optVal. optVal should contain
@@ -623,113 +429,6 @@ func parseMatchers(filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher,
return matchers, nil
}
-// parseTarget parses a target from optVal. optVal should contain only the
-// target.
-func parseTarget(filter stack.IPHeaderFilter, optVal []byte) (stack.Target, error) {
- nflog("set entries: parsing target of size %d", len(optVal))
- if len(optVal) < linux.SizeOfXTEntryTarget {
- return nil, fmt.Errorf("optVal has insufficient size for entry target %d", len(optVal))
- }
- var target linux.XTEntryTarget
- buf := optVal[:linux.SizeOfXTEntryTarget]
- binary.Unmarshal(buf, usermem.ByteOrder, &target)
- switch target.Name.String() {
- case "":
- // Standard target.
- if len(optVal) != linux.SizeOfXTStandardTarget {
- return nil, fmt.Errorf("optVal has wrong size for standard target %d", len(optVal))
- }
- var standardTarget linux.XTStandardTarget
- buf = optVal[:linux.SizeOfXTStandardTarget]
- binary.Unmarshal(buf, usermem.ByteOrder, &standardTarget)
-
- if standardTarget.Verdict < 0 {
- // A Verdict < 0 indicates a non-jump verdict.
- return translateToStandardTarget(standardTarget.Verdict)
- }
- // A verdict >= 0 indicates a jump.
- return JumpTarget{Offset: uint32(standardTarget.Verdict)}, nil
-
- case errorTargetName:
- // Error target.
- if len(optVal) != linux.SizeOfXTErrorTarget {
- return nil, fmt.Errorf("optVal has insufficient size for error target %d", len(optVal))
- }
- var errorTarget linux.XTErrorTarget
- buf = optVal[:linux.SizeOfXTErrorTarget]
- binary.Unmarshal(buf, usermem.ByteOrder, &errorTarget)
-
- // Error targets are used in 2 cases:
- // * An actual error case. These rules have an error
- // named errorTargetName. The last entry of the table
- // is usually an error case to catch any packets that
- // somehow fall through every rule.
- // * To mark the start of a user defined chain. These
- // rules have an error with the name of the chain.
- switch name := errorTarget.Name.String(); name {
- case errorTargetName:
- nflog("set entries: error target")
- return stack.ErrorTarget{}, nil
- default:
- // User defined chain.
- nflog("set entries: user-defined target %q", name)
- return stack.UserChainTarget{Name: name}, nil
- }
-
- case redirectTargetName:
- // Redirect target.
- if len(optVal) < linux.SizeOfXTRedirectTarget {
- return nil, fmt.Errorf("netfilter.SetEntries: optVal has insufficient size for redirect target %d", len(optVal))
- }
-
- if filter.Protocol != header.TCPProtocolNumber && filter.Protocol != header.UDPProtocolNumber {
- return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
- }
-
- var redirectTarget linux.XTRedirectTarget
- buf = optVal[:linux.SizeOfXTRedirectTarget]
- binary.Unmarshal(buf, usermem.ByteOrder, &redirectTarget)
-
- // Copy linux.XTRedirectTarget to stack.RedirectTarget.
- var target stack.RedirectTarget
- nfRange := redirectTarget.NfRange
-
- // RangeSize should be 1.
- if nfRange.RangeSize != 1 {
- return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
- }
-
- // TODO(gvisor.dev/issue/170): Check if the flags are valid.
- // Also check if we need to map ports or IP.
- // For now, redirect target only supports destination port change.
- // Port range and IP range are not supported yet.
- if nfRange.RangeIPV4.Flags&linux.NF_NAT_RANGE_PROTO_SPECIFIED == 0 {
- return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
- }
- target.RangeProtoSpecified = true
-
- target.MinIP = tcpip.Address(nfRange.RangeIPV4.MinIP[:])
- target.MaxIP = tcpip.Address(nfRange.RangeIPV4.MaxIP[:])
-
- // TODO(gvisor.dev/issue/170): Port range is not supported yet.
- if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort {
- return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
- }
-
- // Convert port from big endian to little endian.
- port := make([]byte, 2)
- binary.BigEndian.PutUint16(port, nfRange.RangeIPV4.MinPort)
- target.MinPort = binary.LittleEndian.Uint16(port)
-
- binary.BigEndian.PutUint16(port, nfRange.RangeIPV4.MaxPort)
- target.MaxPort = binary.LittleEndian.Uint16(port)
- return target, nil
- }
-
- // Unknown target.
- return nil, fmt.Errorf("unknown target %q doesn't exist or isn't supported yet.", target.Name.String())
-}
-
func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) {
if containsUnsupportedFields(iptip) {
return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip)
@@ -737,6 +436,9 @@ func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) {
if len(iptip.Dst) != header.IPv4AddressSize || len(iptip.DstMask) != header.IPv4AddressSize {
return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask))
}
+ if len(iptip.Src) != header.IPv4AddressSize || len(iptip.SrcMask) != header.IPv4AddressSize {
+ return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask))
+ }
n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0)
if n == -1 {
@@ -755,6 +457,9 @@ func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) {
Dst: tcpip.Address(iptip.Dst[:]),
DstMask: tcpip.Address(iptip.DstMask[:]),
DstInvert: iptip.InverseFlags&linux.IPT_INV_DSTIP != 0,
+ Src: tcpip.Address(iptip.Src[:]),
+ SrcMask: tcpip.Address(iptip.SrcMask[:]),
+ SrcInvert: iptip.InverseFlags&linux.IPT_INV_SRCIP != 0,
OutputInterface: ifname,
OutputInterfaceMask: ifnameMask,
OutputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_OUT != 0,
@@ -765,15 +470,13 @@ func containsUnsupportedFields(iptip linux.IPTIP) bool {
// The following features are supported:
// - Protocol
// - Dst and DstMask
+ // - Src and SrcMask
// - The inverse destination IP check flag
// - OutputInterface, OutputInterfaceMask and its inverse.
- var emptyInetAddr = linux.InetAddr{}
var emptyInterface = [linux.IFNAMSIZ]byte{}
// Disable any supported inverse flags.
- inverseMask := uint8(linux.IPT_INV_DSTIP) | uint8(linux.IPT_INV_VIA_OUT)
- return iptip.Src != emptyInetAddr ||
- iptip.SrcMask != emptyInetAddr ||
- iptip.InputInterface != emptyInterface ||
+ inverseMask := uint8(linux.IPT_INV_DSTIP) | uint8(linux.IPT_INV_SRCIP) | uint8(linux.IPT_INV_VIA_OUT)
+ return iptip.InputInterface != emptyInterface ||
iptip.InputInterfaceMask != emptyInterface ||
iptip.Flags != 0 ||
iptip.InverseFlags&^inverseMask != 0
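
The rewritten convertNetstackToBinary keeps all of Linux's blob bookkeeping in linux.IPTGetinfo: each serialized rule advances entries.Size by its NextOffset, and HookEntry/Underflow record the byte offsets at which chains begin. A standalone sketch of that offset arithmetic, with made-up sizes standing in for the real ABI constants:

package main

import "fmt"

func main() {
	// Assumed stand-in for linux.SizeOfIPTEntry; the real constant lives in
	// the ABI package. The extra bytes model each rule's serialized
	// matchers and target.
	const sizeOfIPTEntry = 112
	ruleExtraBytes := []uint32{40, 48, 40}

	var size uint32
	hookEntry := map[int]uint32{} // hook -> byte offset, like info.HookEntry
	for ruleIdx, extra := range ruleExtraBytes {
		if ruleIdx == 0 {
			hookEntry[0] = size // a chain starting at this rule
		}
		nextOffset := uint32(sizeOfIPTEntry) + extra
		fmt.Printf("rule %d at offset %d\n", ruleIdx, size)
		size += nextOffset
	}
	fmt.Printf("total blob size: %d (becomes info.Size and entries.Size)\n", size)
	fmt.Printf("hook entry offsets: %v\n", hookEntry)
}
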
diff --git a/pkg/sentry/socket/netfilter/owner_matcher.go b/pkg/sentry/socket/netfilter/owner_matcher.go
index 3863293c7..1b4e0ad79 100644
--- a/pkg/sentry/socket/netfilter/owner_matcher.go
+++ b/pkg/sentry/socket/netfilter/owner_matcher.go
@@ -111,7 +111,7 @@ func (*OwnerMatcher) Name() string {
}
// Match implements Matcher.Match.
-func (om *OwnerMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceName string) (bool, bool) {
+func (om *OwnerMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
// Support only for OUTPUT chain.
// TODO(gvisor.dev/issue/170): Need to support for POSTROUTING chain also.
if hook != stack.Output {
diff --git a/pkg/sentry/socket/netfilter/targets.go b/pkg/sentry/socket/netfilter/targets.go
index 84abe8d29..8ebdaff18 100644
--- a/pkg/sentry/socket/netfilter/targets.go
+++ b/pkg/sentry/socket/netfilter/targets.go
@@ -15,10 +15,257 @@
package netfilter
import (
+ "errors"
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/stack"
+ "gvisor.dev/gvisor/pkg/usermem"
)
+// errorTargetName is used to mark targets as error targets. Error targets
+// shouldn't be reached - an error has occurred if we fall through to one.
+const errorTargetName = "ERROR"
+
+// redirectTargetName is used to mark targets as redirect targets. Redirect
+// targets should be reached for only NAT and Mangle tables. These targets will
+// change the destination port/destination IP for packets.
+const redirectTargetName = "REDIRECT"
+
+func marshalTarget(target stack.Target) []byte {
+ switch tg := target.(type) {
+ case stack.AcceptTarget:
+ return marshalStandardTarget(stack.RuleAccept)
+ case stack.DropTarget:
+ return marshalStandardTarget(stack.RuleDrop)
+ case stack.ErrorTarget:
+ return marshalErrorTarget(errorTargetName)
+ case stack.UserChainTarget:
+ return marshalErrorTarget(tg.Name)
+ case stack.ReturnTarget:
+ return marshalStandardTarget(stack.RuleReturn)
+ case stack.RedirectTarget:
+ return marshalRedirectTarget(tg)
+ case JumpTarget:
+ return marshalJumpTarget(tg)
+ default:
+ panic(fmt.Errorf("unknown target of type %T", target))
+ }
+}
+
+func marshalStandardTarget(verdict stack.RuleVerdict) []byte {
+ nflog("convert to binary: marshalling standard target")
+
+ // The target's name will be the empty string.
+ target := linux.XTStandardTarget{
+ Target: linux.XTEntryTarget{
+ TargetSize: linux.SizeOfXTStandardTarget,
+ },
+ Verdict: translateFromStandardVerdict(verdict),
+ }
+
+ ret := make([]byte, 0, linux.SizeOfXTStandardTarget)
+ return binary.Marshal(ret, usermem.ByteOrder, target)
+}
+
+func marshalErrorTarget(errorName string) []byte {
+ // This is an error target; its name is copied from errorName.
+ target := linux.XTErrorTarget{
+ Target: linux.XTEntryTarget{
+ TargetSize: linux.SizeOfXTErrorTarget,
+ },
+ }
+ copy(target.Name[:], errorName)
+ copy(target.Target.Name[:], errorTargetName)
+
+ ret := make([]byte, 0, linux.SizeOfXTErrorTarget)
+ return binary.Marshal(ret, usermem.ByteOrder, target)
+}
+
+func marshalRedirectTarget(rt stack.RedirectTarget) []byte {
+ // This is a redirect target named REDIRECT.
+ target := linux.XTRedirectTarget{
+ Target: linux.XTEntryTarget{
+ TargetSize: linux.SizeOfXTRedirectTarget,
+ },
+ }
+ copy(target.Target.Name[:], redirectTargetName)
+
+ ret := make([]byte, 0, linux.SizeOfXTRedirectTarget)
+ target.NfRange.RangeSize = 1
+ if rt.RangeProtoSpecified {
+ target.NfRange.RangeIPV4.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED
+ }
+ // Convert port from little endian to big endian.
+ port := make([]byte, 2)
+ binary.LittleEndian.PutUint16(port, rt.MinPort)
+ target.NfRange.RangeIPV4.MinPort = binary.BigEndian.Uint16(port)
+ binary.LittleEndian.PutUint16(port, rt.MaxPort)
+ target.NfRange.RangeIPV4.MaxPort = binary.BigEndian.Uint16(port)
+ return binary.Marshal(ret, usermem.ByteOrder, target)
+}
+
+func marshalJumpTarget(jt JumpTarget) []byte {
+ nflog("convert to binary: marshalling jump target")
+
+ // The target's name will be the empty string.
+ target := linux.XTStandardTarget{
+ Target: linux.XTEntryTarget{
+ TargetSize: linux.SizeOfXTStandardTarget,
+ },
+ // Verdict is overloaded by the ABI. When positive, it holds
+ // the jump offset from the start of the table.
+ Verdict: int32(jt.Offset),
+ }
+
+ ret := make([]byte, 0, linux.SizeOfXTStandardTarget)
+ return binary.Marshal(ret, usermem.ByteOrder, target)
+}
+
+// translateFromStandardVerdict translates verdicts the same way as the iptables
+// tool.
+func translateFromStandardVerdict(verdict stack.RuleVerdict) int32 {
+ switch verdict {
+ case stack.RuleAccept:
+ return -linux.NF_ACCEPT - 1
+ case stack.RuleDrop:
+ return -linux.NF_DROP - 1
+ case stack.RuleReturn:
+ return linux.NF_RETURN
+ default:
+ // TODO(gvisor.dev/issue/170): Support Jump.
+ panic(fmt.Sprintf("unknown standard verdict: %d", verdict))
+ }
+}
+
+// translateToStandardTarget translates from the value in a
+// linux.XTStandardTarget to a stack.Verdict.
+func translateToStandardTarget(val int32) (stack.Target, error) {
+ // TODO(gvisor.dev/issue/170): Support other verdicts.
+ switch val {
+ case -linux.NF_ACCEPT - 1:
+ return stack.AcceptTarget{}, nil
+ case -linux.NF_DROP - 1:
+ return stack.DropTarget{}, nil
+ case -linux.NF_QUEUE - 1:
+ return nil, errors.New("unsupported iptables verdict QUEUE")
+ case linux.NF_RETURN:
+ return stack.ReturnTarget{}, nil
+ default:
+ return nil, fmt.Errorf("unknown iptables verdict %d", val)
+ }
+}
+
+// parseTarget parses a target from optVal. optVal should contain only the
+// target.
+func parseTarget(filter stack.IPHeaderFilter, optVal []byte) (stack.Target, error) {
+ nflog("set entries: parsing target of size %d", len(optVal))
+ if len(optVal) < linux.SizeOfXTEntryTarget {
+ return nil, fmt.Errorf("optVal has insufficient size for entry target %d", len(optVal))
+ }
+ var target linux.XTEntryTarget
+ buf := optVal[:linux.SizeOfXTEntryTarget]
+ binary.Unmarshal(buf, usermem.ByteOrder, &target)
+ switch target.Name.String() {
+ case "":
+ // Standard target.
+ if len(optVal) != linux.SizeOfXTStandardTarget {
+ return nil, fmt.Errorf("optVal has wrong size for standard target %d", len(optVal))
+ }
+ var standardTarget linux.XTStandardTarget
+ buf = optVal[:linux.SizeOfXTStandardTarget]
+ binary.Unmarshal(buf, usermem.ByteOrder, &standardTarget)
+
+ if standardTarget.Verdict < 0 {
+ // A Verdict < 0 indicates a non-jump verdict.
+ return translateToStandardTarget(standardTarget.Verdict)
+ }
+ // A verdict >= 0 indicates a jump.
+ return JumpTarget{Offset: uint32(standardTarget.Verdict)}, nil
+
+ case errorTargetName:
+ // Error target.
+ if len(optVal) != linux.SizeOfXTErrorTarget {
+ return nil, fmt.Errorf("optVal has insufficient size for error target %d", len(optVal))
+ }
+ var errorTarget linux.XTErrorTarget
+ buf = optVal[:linux.SizeOfXTErrorTarget]
+ binary.Unmarshal(buf, usermem.ByteOrder, &errorTarget)
+
+ // Error targets are used in 2 cases:
+ // * An actual error case. These rules have an error
+ // named errorTargetName. The last entry of the table
+ // is usually an error case to catch any packets that
+ // somehow fall through every rule.
+ // * To mark the start of a user defined chain. These
+ // rules have an error with the name of the chain.
+ switch name := errorTarget.Name.String(); name {
+ case errorTargetName:
+ nflog("set entries: error target")
+ return stack.ErrorTarget{}, nil
+ default:
+ // User defined chain.
+ nflog("set entries: user-defined target %q", name)
+ return stack.UserChainTarget{Name: name}, nil
+ }
+
+ case redirectTargetName:
+ // Redirect target.
+ if len(optVal) < linux.SizeOfXTRedirectTarget {
+ return nil, fmt.Errorf("netfilter.SetEntries: optVal has insufficient size for redirect target %d", len(optVal))
+ }
+
+ if filter.Protocol != header.TCPProtocolNumber && filter.Protocol != header.UDPProtocolNumber {
+ return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
+ }
+
+ var redirectTarget linux.XTRedirectTarget
+ buf = optVal[:linux.SizeOfXTRedirectTarget]
+ binary.Unmarshal(buf, usermem.ByteOrder, &redirectTarget)
+
+ // Copy linux.XTRedirectTarget to stack.RedirectTarget.
+ var target stack.RedirectTarget
+ nfRange := redirectTarget.NfRange
+
+ // RangeSize should be 1.
+ if nfRange.RangeSize != 1 {
+ return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
+ }
+
+ // TODO(gvisor.dev/issue/170): Check if the flags are valid.
+ // Also check if we need to map ports or IP.
+ // For now, redirect target only supports destination port change.
+ // Port range and IP range are not supported yet.
+ if nfRange.RangeIPV4.Flags&linux.NF_NAT_RANGE_PROTO_SPECIFIED == 0 {
+ return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
+ }
+ target.RangeProtoSpecified = true
+
+ target.MinIP = tcpip.Address(nfRange.RangeIPV4.MinIP[:])
+ target.MaxIP = tcpip.Address(nfRange.RangeIPV4.MaxIP[:])
+
+ // TODO(gvisor.dev/issue/170): Port range is not supported yet.
+ if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort {
+ return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
+ }
+
+ // Convert port from big endian to little endian.
+ port := make([]byte, 2)
+ binary.BigEndian.PutUint16(port, nfRange.RangeIPV4.MinPort)
+ target.MinPort = binary.LittleEndian.Uint16(port)
+
+ binary.BigEndian.PutUint16(port, nfRange.RangeIPV4.MaxPort)
+ target.MaxPort = binary.LittleEndian.Uint16(port)
+ return target, nil
+ }
+
+ // Unknown target.
+ return nil, fmt.Errorf("unknown target %q doesn't exist or isn't supported yet", target.Name.String())
+}
+
// JumpTarget implements stack.Target.
type JumpTarget struct {
// Offset is the byte offset of the rule to jump to. It is used for
@@ -30,6 +277,6 @@ type JumpTarget struct {
}
// Action implements stack.Target.Action.
-func (jt JumpTarget) Action(*stack.PacketBuffer, *stack.ConnTrackTable, stack.Hook, *stack.GSO, *stack.Route, tcpip.Address) (stack.RuleVerdict, int) {
+func (jt JumpTarget) Action(*stack.PacketBuffer, *stack.ConnTrack, stack.Hook, *stack.GSO, *stack.Route, tcpip.Address) (stack.RuleVerdict, int) {
return stack.RuleJump, jt.RuleNum
}
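
The standard-target encoding above is worth a worked example: a non-jump verdict v is stored as -v-1 (so it is always negative), while a non-negative Verdict is a byte offset to jump to. A self-contained sketch using the Linux ABI values NF_DROP=0 and NF_ACCEPT=1:

package main

import "fmt"

const (
	nfDrop   = 0 // linux.NF_DROP
	nfAccept = 1 // linux.NF_ACCEPT
)

// encodeStandardVerdict mirrors translateFromStandardVerdict's arithmetic.
func encodeStandardVerdict(nf int32) int32 { return -nf - 1 }

// decode mirrors the split in parseTarget: negative values are verdicts,
// non-negative values are jump offsets.
func decode(val int32) string {
	switch {
	case val == encodeStandardVerdict(nfAccept):
		return "ACCEPT"
	case val == encodeStandardVerdict(nfDrop):
		return "DROP"
	case val >= 0:
		return fmt.Sprintf("JUMP to byte offset %d", val)
	default:
		return "unknown"
	}
}

func main() {
	fmt.Println(decode(-2))  // -NF_ACCEPT-1 => ACCEPT
	fmt.Println(decode(-1))  // -NF_DROP-1  => DROP
	fmt.Println(decode(240)) // non-negative => jump offset
}
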
diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go
index 57a1e1c12..0bfd6c1f4 100644
--- a/pkg/sentry/socket/netfilter/tcp_matcher.go
+++ b/pkg/sentry/socket/netfilter/tcp_matcher.go
@@ -96,8 +96,8 @@ func (*TCPMatcher) Name() string {
}
// Match implements Matcher.Match.
-func (tm *TCPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceName string) (bool, bool) {
- netHeader := header.IPv4(pkt.NetworkHeader)
+func (tm *TCPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
+ netHeader := header.IPv4(pkt.NetworkHeader().View())
if netHeader.TransportProtocol() != header.TCPProtocolNumber {
return false, false
@@ -111,36 +111,10 @@ func (tm *TCPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceNa
return false, false
}
- // Now we need the transport header. However, this may not have been set
- // yet.
- // TODO(gvisor.dev/issue/170): Parsing the transport header should
- // ultimately be moved into the stack.Check codepath as matchers are
- // added.
- var tcpHeader header.TCP
- if pkt.TransportHeader != nil {
- tcpHeader = header.TCP(pkt.TransportHeader)
- } else {
- var length int
- if hook == stack.Prerouting {
- // The network header hasn't been parsed yet. We have to do it here.
- hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
- if !ok {
- // There's no valid TCP header here, so we hotdrop the
- // packet.
- return false, true
- }
- h := header.IPv4(hdr)
- pkt.NetworkHeader = hdr
- length = int(h.HeaderLength())
- }
- // The TCP header hasn't been parsed yet. We have to do it here.
- hdr, ok := pkt.Data.PullUp(length + header.TCPMinimumSize)
- if !ok {
- // There's no valid TCP header here, so we hotdrop the
- // packet.
- return false, true
- }
- tcpHeader = header.TCP(hdr[length:])
+ tcpHeader := header.TCP(pkt.TransportHeader().View())
+ if len(tcpHeader) < header.TCPMinimumSize {
+ // There's no valid TCP header here, so we drop the packet immediately.
+ return false, true
}
// Check whether the source and destination ports are within the
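
With header parsing moved into the stack, Match reduces to a length check plus the port comparison that the truncated context above leads into. A sketch of a port-range check in that shape (the real matcher reads ports via header.TCP's SourcePort/DestinationPort accessors; the field names and ranges here are illustrative). The same shape applies to the UDP matcher below:

package main

import "fmt"

type portRange struct{ start, end uint16 }

func (r portRange) contains(p uint16) bool { return r.start <= p && p <= r.end }

// match mirrors the (matched, hotdrop) contract of Matcher.Match: ports out
// of range mean "no match, keep processing"; only a malformed header hotdrops.
func match(src, dst uint16, srcRange, dstRange portRange) (bool, bool) {
	if !srcRange.contains(src) || !dstRange.contains(dst) {
		return false, false
	}
	return true, false
}

func main() {
	fmt.Println(match(55555, 80, portRange{0, 65535}, portRange{80, 80}))
}
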
diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go
index cfa9e621d..7ed05461d 100644
--- a/pkg/sentry/socket/netfilter/udp_matcher.go
+++ b/pkg/sentry/socket/netfilter/udp_matcher.go
@@ -93,8 +93,8 @@ func (*UDPMatcher) Name() string {
}
// Match implements Matcher.Match.
-func (um *UDPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceName string) (bool, bool) {
- netHeader := header.IPv4(pkt.NetworkHeader)
+func (um *UDPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
+ netHeader := header.IPv4(pkt.NetworkHeader().View())
// TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
// into the stack.Check codepath as matchers are added.
@@ -110,36 +110,10 @@ func (um *UDPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceNa
return false, false
}
- // Now we need the transport header. However, this may not have been set
- // yet.
- // TODO(gvisor.dev/issue/170): Parsing the transport header should
- // ultimately be moved into the stack.Check codepath as matchers are
- // added.
- var udpHeader header.UDP
- if pkt.TransportHeader != nil {
- udpHeader = header.UDP(pkt.TransportHeader)
- } else {
- var length int
- if hook == stack.Prerouting {
- // The network header hasn't been parsed yet. We have to do it here.
- hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
- if !ok {
- // There's no valid UDP header here, so we hotdrop the
- // packet.
- return false, true
- }
- h := header.IPv4(hdr)
- pkt.NetworkHeader = hdr
- length = int(h.HeaderLength())
- }
- // The UDP header hasn't been parsed yet. We have to do it here.
- hdr, ok := pkt.Data.PullUp(length + header.UDPMinimumSize)
- if !ok {
- // There's no valid UDP header here, so we hotdrop the
- // packet.
- return false, true
- }
- udpHeader = header.UDP(hdr[length:])
+ udpHeader := header.UDP(pkt.TransportHeader().View())
+ if len(udpHeader) < header.UDPMinimumSize {
+ // There's no valid UDP header here, so we drop the packet immediately.
+ return false, true
}
// Check whether the source and destination ports are within the
diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index 7212d8644..0546801bf 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -20,6 +20,7 @@ go_library(
"//pkg/sentry/device",
"//pkg/sentry/fs",
"//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/fsimpl/sockfs",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
@@ -35,6 +36,8 @@ go_library(
"//pkg/tcpip",
"//pkg/usermem",
"//pkg/waiter",
+ "//tools/go_marshal/marshal",
+ "//tools/go_marshal/primitive",
],
)
diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go
index 0d45e5053..31e374833 100644
--- a/pkg/sentry/socket/netlink/provider.go
+++ b/pkg/sentry/socket/netlink/provider.go
@@ -97,7 +97,7 @@ func (*socketProvider) Socket(t *kernel.Task, stype linux.SockType, protocol int
}
d := socket.NewDirent(t, netlinkSocketDevice)
- defer d.DecRef()
+ defer d.DecRef(t)
return fs.NewFile(t, d, fs.FileFlags{Read: true, Write: true, NonSeekable: true}, s), nil
}
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index 81f34c5a2..68a9b9a96 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -38,6 +38,8 @@ import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
+ "gvisor.dev/gvisor/tools/go_marshal/marshal"
+ "gvisor.dev/gvisor/tools/go_marshal/primitive"
)
const sizeOfInt32 int = 4
@@ -138,14 +140,14 @@ func NewSocket(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socke
// Bind the endpoint for good measure so we can connect to it. The
// bound address will never be exposed.
if err := ep.Bind(tcpip.FullAddress{Addr: "dummy"}, nil); err != nil {
- ep.Close()
+ ep.Close(t)
return nil, err
}
// Create a connection from which the kernel can write messages.
connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect(t)
if err != nil {
- ep.Close()
+ ep.Close(t)
return nil, err
}
@@ -162,9 +164,9 @@ func NewSocket(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socke
}
// Release implements fs.FileOperations.Release.
-func (s *socketOpsCommon) Release() {
- s.connection.Release()
- s.ep.Close()
+func (s *socketOpsCommon) Release(ctx context.Context) {
+ s.connection.Release(ctx)
+ s.ep.Close(ctx)
if s.bound {
s.ports.Release(s.protocol.Protocol(), s.portID)
@@ -330,7 +332,7 @@ func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error {
}
// GetSockOpt implements socket.Socket.GetSockOpt.
-func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
+func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
switch level {
case linux.SOL_SOCKET:
switch name {
@@ -340,24 +342,26 @@ func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr
}
s.mu.Lock()
defer s.mu.Unlock()
- return int32(s.sendBufferSize), nil
+ sendBufferSizeP := primitive.Int32(s.sendBufferSize)
+ return &sendBufferSizeP, nil
case linux.SO_RCVBUF:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
// We don't have limit on receiving size.
- return int32(math.MaxInt32), nil
+ recvBufferSizeP := primitive.Int32(math.MaxInt32)
+ return &recvBufferSizeP, nil
case linux.SO_PASSCRED:
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
- var passcred int32
+ var passcred primitive.Int32
if s.Passcred() {
passcred = 1
}
- return passcred, nil
+ return &passcred, nil
default:
socket.GetSockOptEmitUnimplementedEvent(t, name)
@@ -617,7 +621,7 @@ func (s *socketOpsCommon) sendResponse(ctx context.Context, ms *MessageSet) *sys
if len(bufs) > 0 {
// RecvMsg never receives the address, so we don't need to send
// one.
- _, notify, err := s.connection.Send(bufs, cms, tcpip.FullAddress{})
+ _, notify, err := s.connection.Send(ctx, bufs, cms, tcpip.FullAddress{})
// If the buffer is full, we simply drop messages, just like
// Linux.
if err != nil && err != syserr.ErrWouldBlock {
@@ -644,7 +648,7 @@ func (s *socketOpsCommon) sendResponse(ctx context.Context, ms *MessageSet) *sys
// Add the dump_done_errno payload.
m.Put(int64(0))
- _, notify, err := s.connection.Send([][]byte{m.Finalize()}, cms, tcpip.FullAddress{})
+ _, notify, err := s.connection.Send(ctx, [][]byte{m.Finalize()}, cms, tcpip.FullAddress{})
if err != nil && err != syserr.ErrWouldBlock {
return err
}
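
Returning marshal.Marshallable instead of interface{} lets the syscall layer serialize any option value through one interface, with no reflection on concrete types. A self-contained sketch of the pattern with local stand-ins for go_marshal's primitive.Int32 (the interface here is simplified; go_marshal's real Marshallable has more methods):

package main

import (
	"encoding/binary"
	"fmt"
)

// Marshallable is a cut-down stand-in for go_marshal's interface.
type Marshallable interface {
	SizeBytes() int
	MarshalBytes(dst []byte)
}

// Int32 stands in for primitive.Int32.
type Int32 int32

func (i *Int32) SizeBytes() int { return 4 }

func (i *Int32) MarshalBytes(dst []byte) {
	binary.LittleEndian.PutUint32(dst, uint32(*i))
}

// copyOutSockOpt shows how a caller might serialize an option value,
// truncated to the user-supplied optLen as getsockopt(2) requires.
func copyOutSockOpt(v Marshallable, optLen int) []byte {
	buf := make([]byte, v.SizeBytes())
	v.MarshalBytes(buf)
	if len(buf) > optLen {
		buf = buf[:optLen]
	}
	return buf
}

func main() {
	passcred := Int32(1)
	fmt.Printf("% x\n", copyOutSockOpt(&passcred, 4))
}
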
diff --git a/pkg/sentry/socket/netlink/socket_vfs2.go b/pkg/sentry/socket/netlink/socket_vfs2.go
index b854bf990..a38d25da9 100644
--- a/pkg/sentry/socket/netlink/socket_vfs2.go
+++ b/pkg/sentry/socket/netlink/socket_vfs2.go
@@ -18,6 +18,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/socket"
"gvisor.dev/gvisor/pkg/sentry/socket/unix"
@@ -40,6 +41,7 @@ type SocketVFS2 struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
+ vfs.LockFD
socketOpsCommon
}
@@ -55,18 +57,18 @@ func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketV
// Bind the endpoint for good measure so we can connect to it. The
// bound address will never be exposed.
if err := ep.Bind(tcpip.FullAddress{Addr: "dummy"}, nil); err != nil {
- ep.Close()
+ ep.Close(t)
return nil, err
}
// Create a connection from which the kernel can write messages.
connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect(t)
if err != nil {
- ep.Close()
+ ep.Close(t)
return nil, err
}
- return &SocketVFS2{
+ fd := &SocketVFS2{
socketOpsCommon: socketOpsCommon{
ports: t.Kernel().NetlinkPorts(),
protocol: protocol,
@@ -75,7 +77,9 @@ func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketV
connection: connection,
sendBufferSize: defaultSendBufferSize,
},
- }, nil
+ }
+ fd.LockFD.Init(&vfs.FileLocks{})
+ return fd, nil
}
// Readiness implements waiter.Waitable.Readiness.
@@ -136,3 +140,13 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs
n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{})
return int64(n), err.ToError()
}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence)
+}
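
The socket_vfs2.go change follows the standard VFS2 locking recipe: embed vfs.LockFD, initialize it with a vfs.FileLocks, and forward LockPOSIX/UnlockPOSIX to Locks(). A toy, self-contained version of the pattern with local stand-ins for the vfs types:

package main

import "fmt"

// FileLocks and LockFD stand in for vfs.FileLocks and vfs.LockFD.
type FileLocks struct{ held map[int]bool }

type LockFD struct{ locks *FileLocks }

func (fd *LockFD) Init(locks *FileLocks) {
	if locks.held == nil {
		locks.held = make(map[int]bool)
	}
	fd.locks = locks
}

func (fd *LockFD) LockPOSIX(uid int) error {
	fd.locks.held[uid] = true
	return nil
}

func (fd *LockFD) UnlockPOSIX(uid int) error {
	delete(fd.locks.held, uid)
	return nil
}

// socketFD embeds LockFD, as SocketVFS2 does, so the lock methods are
// promoted onto the file description.
type socketFD struct{ LockFD }

func main() {
	var s socketFD
	s.Init(&FileLocks{})
	s.LockPOSIX(42)
	fmt.Println(len(s.locks.held)) // 1
}
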
diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD
index 333e0042e..1fb777a6c 100644
--- a/pkg/sentry/socket/netstack/BUILD
+++ b/pkg/sentry/socket/netstack/BUILD
@@ -28,6 +28,7 @@ go_library(
"//pkg/sentry/device",
"//pkg/sentry/fs",
"//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/fsimpl/sockfs",
"//pkg/sentry/inet",
"//pkg/sentry/kernel",
@@ -50,5 +51,8 @@ go_library(
"//pkg/tcpip/transport/udp",
"//pkg/usermem",
"//pkg/waiter",
+ "//tools/go_marshal/marshal",
+ "//tools/go_marshal/primitive",
+ "@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 60df51dae..e4846bc0b 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -26,6 +26,7 @@ package netstack
import (
"bytes"
+ "fmt"
"io"
"math"
"reflect"
@@ -33,6 +34,7 @@ import (
"syscall"
"time"
+ "golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/amutex"
"gvisor.dev/gvisor/pkg/binary"
@@ -60,6 +62,8 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
+ "gvisor.dev/gvisor/tools/go_marshal/marshal"
+ "gvisor.dev/gvisor/tools/go_marshal/primitive"
)
func mustCreateMetric(name, description string) *tcpip.StatCounter {
@@ -190,6 +194,8 @@ var Metrics = tcpip.Stats{
MalformedPacketsReceived: mustCreateMetric("/netstack/udp/malformed_packets_received", "Number of incoming UDP datagrams dropped due to the UDP header being in a malformed state."),
PacketsSent: mustCreateMetric("/netstack/udp/packets_sent", "Number of UDP datagrams sent."),
PacketSendErrors: mustCreateMetric("/netstack/udp/packet_send_errors", "Number of UDP datagrams failed to be sent."),
+ ChecksumErrors: mustCreateMetric("/netstack/udp/checksum_errors", "Number of UDP datagrams dropped due to bad checksums."),
+ InvalidSourceAddress: mustCreateMetric("/netstack/udp/invalid_source", "Number of UDP datagrams dropped due to invalid source address."),
},
}
@@ -294,8 +300,9 @@ type socketOpsCommon struct {
readView buffer.View
// readCM holds control message information for the last packet read
// from Endpoint.
- readCM tcpip.ControlMessages
- sender tcpip.FullAddress
+ readCM tcpip.ControlMessages
+ sender tcpip.FullAddress
+ linkPacketInfo tcpip.LinkPacketInfo
// sockOptTimestamp corresponds to SO_TIMESTAMP. When true, timestamps
// of returned messages can be returned via control messages. When
@@ -323,7 +330,7 @@ func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue
}
dirent := socket.NewDirent(t, netstackDevice)
- defer dirent.DecRef()
+ defer dirent.DecRef(t)
return fs.NewFile(t, dirent, fs.FileFlags{Read: true, Write: true, NonSeekable: true}, &SocketOperations{
socketOpsCommon: socketOpsCommon{
Queue: queue,
@@ -416,7 +423,7 @@ func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) {
return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
}
- // TODO(b/129292371): Return protocol too.
+ // TODO(gvisor.dev/issue/173): Return protocol too.
return tcpip.FullAddress{
NIC: tcpip.NICID(a.InterfaceIndex),
Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]),
@@ -444,8 +451,21 @@ func (s *socketOpsCommon) fetchReadView() *syserr.Error {
}
s.readView = nil
s.sender = tcpip.FullAddress{}
+ s.linkPacketInfo = tcpip.LinkPacketInfo{}
- v, cms, err := s.Endpoint.Read(&s.sender)
+ var v buffer.View
+ var cms tcpip.ControlMessages
+ var err *tcpip.Error
+
+ switch e := s.Endpoint.(type) {
+ // The ordering of these interfaces matters. The most specific
+ // interfaces must be specified before the more generic Endpoint
+ // interface.
+ case tcpip.PacketEndpoint:
+ v, cms, err = e.ReadPacket(&s.sender, &s.linkPacketInfo)
+ case tcpip.Endpoint:
+ v, cms, err = e.Read(&s.sender)
+ }
if err != nil {
atomic.StoreUint32(&s.readViewHasData, 0)
return syserr.TranslateNetstackError(err)
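
The ordering comment in the hunk above is load-bearing: a Go type switch takes the first matching case, and every tcpip.PacketEndpoint also satisfies tcpip.Endpoint, so the more specific interface must be listed first. A runnable sketch of that rule with local stand-in interfaces:

package main

import "fmt"

type Endpoint interface{ Read() string }

type PacketEndpoint interface {
	Endpoint
	ReadPacket() string
}

type packetEP struct{}

func (packetEP) Read() string       { return "generic read" }
func (packetEP) ReadPacket() string { return "packet read with link info" }

func dispatch(ep Endpoint) string {
	switch e := ep.(type) {
	case PacketEndpoint: // must precede the generic case to ever be chosen
		return e.ReadPacket()
	case Endpoint:
		return e.Read()
	}
	return ""
}

func main() {
	fmt.Println(dispatch(packetEP{})) // "packet read with link info"
}
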
@@ -459,7 +479,7 @@ func (s *socketOpsCommon) fetchReadView() *syserr.Error {
}
// Release implements fs.FileOperations.Release.
-func (s *socketOpsCommon) Release() {
+func (s *socketOpsCommon) Release(context.Context) {
s.Endpoint.Close()
}
@@ -719,6 +739,14 @@ func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool
defer s.EventUnregister(&e)
if err := s.Endpoint.Connect(addr); err != tcpip.ErrConnectStarted && err != tcpip.ErrAlreadyConnecting {
+ if (s.family == unix.AF_INET || s.family == unix.AF_INET6) && s.skType == linux.SOCK_STREAM {
+ // TCP, unlike UDP, returns EADDRNOTAVAIL when it can't
+ // find an available local ephemeral port.
+ if err == tcpip.ErrNoPortAvailable {
+ return syserr.ErrAddressNotAvailable
+ }
+ }
+
return syserr.TranslateNetstackError(err)
}
@@ -826,7 +854,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
if err != nil {
return 0, nil, 0, err
}
- defer ns.DecRef()
+ defer ns.DecRef(t)
if flags&linux.SOCK_NONBLOCK != 0 {
flags := ns.Flags()
@@ -884,7 +912,7 @@ func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error {
// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
// tcpip.Endpoint.
-func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
+func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
// TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is
// implemented specifically for netstack.SocketOperations rather than
// commonEndpoint. commonEndpoint should be extended to support socket
@@ -894,25 +922,25 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
- val := int32(0)
+ val := primitive.Int32(0)
s.readMu.Lock()
defer s.readMu.Unlock()
if s.sockOptTimestamp {
val = 1
}
- return val, nil
+ return &val, nil
}
if level == linux.SOL_TCP && name == linux.TCP_INQ {
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
- val := int32(0)
+ val := primitive.Int32(0)
s.readMu.Lock()
defer s.readMu.Unlock()
if s.sockOptInq {
val = 1
}
- return val, nil
+ return &val, nil
}
if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP {
@@ -930,7 +958,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us
if err != nil {
return nil, err
}
- return info, nil
+ return &info, nil
case linux.IPT_SO_GET_ENTRIES:
if outLen < linux.SizeOfIPTGetEntries {
@@ -945,7 +973,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us
if err != nil {
return nil, err
}
- return entries, nil
+ return &entries, nil
}
}
@@ -955,7 +983,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us
// GetSockOpt can be used to implement the linux syscall getsockopt(2) for
// sockets backed by a commonEndpoint.
-func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (interface{}, *syserr.Error) {
+func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (marshal.Marshallable, *syserr.Error) {
switch level {
case linux.SOL_SOCKET:
return getSockOptSocket(t, s, ep, family, skType, name, outLen)
@@ -988,7 +1016,7 @@ func boolToInt32(v bool) int32 {
}
// getSockOptSocket implements GetSockOpt when level is SOL_SOCKET.
-func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (interface{}, *syserr.Error) {
+func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (marshal.Marshallable, *syserr.Error) {
// TODO(b/124056281): Stop rejecting short optLen values in getsockopt.
switch name {
case linux.SO_ERROR:
@@ -999,9 +1027,12 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
// Get the last error and convert it.
err := ep.GetSockOpt(tcpip.ErrorOption{})
if err == nil {
- return int32(0), nil
+ optP := primitive.Int32(0)
+ return &optP, nil
}
- return int32(syserr.TranslateNetstackError(err).ToLinux().Number()), nil
+
+ optP := primitive.Int32(syserr.TranslateNetstackError(err).ToLinux().Number())
+ return &optP, nil
case linux.SO_PEERCRED:
if family != linux.AF_UNIX || outLen < syscall.SizeofUcred {
@@ -1009,11 +1040,12 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
}
tcred := t.Credentials()
- return syscall.Ucred{
- Pid: int32(t.ThreadGroup().ID()),
- Uid: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()),
- Gid: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()),
- }, nil
+ creds := linux.ControlMessageCredentials{
+ PID: int32(t.ThreadGroup().ID()),
+ UID: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()),
+ GID: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()),
+ }
+ return &creds, nil
case linux.SO_PASSCRED:
if outLen < sizeOfInt32 {
@@ -1024,7 +1056,9 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(v), nil
+
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
case linux.SO_SNDBUF:
if outLen < sizeOfInt32 {
@@ -1040,7 +1074,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
size = math.MaxInt32
}
- return int32(size), nil
+ sizeP := primitive.Int32(size)
+ return &sizeP, nil
case linux.SO_RCVBUF:
if outLen < sizeOfInt32 {
@@ -1056,7 +1091,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
size = math.MaxInt32
}
- return int32(size), nil
+ sizeP := primitive.Int32(size)
+ return &sizeP, nil
case linux.SO_REUSEADDR:
if outLen < sizeOfInt32 {
@@ -1067,7 +1103,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(v), nil
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
case linux.SO_REUSEPORT:
if outLen < sizeOfInt32 {
@@ -1078,7 +1115,9 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(v), nil
+
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
case linux.SO_BINDTODEVICE:
var v tcpip.BindToDeviceOption
@@ -1086,7 +1125,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
return nil, syserr.TranslateNetstackError(err)
}
if v == 0 {
- return []byte{}, nil
+ var b primitive.ByteSlice
+ return &b, nil
}
if outLen < linux.IFNAMSIZ {
return nil, syserr.ErrInvalidArgument
@@ -1101,7 +1141,9 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
// interface was removed.
return nil, syserr.ErrUnknownDevice
}
- return append([]byte(nic.Name), 0), nil
+
+ name := primitive.ByteSlice(append([]byte(nic.Name), 0))
+ return &name, nil
case linux.SO_BROADCAST:
if outLen < sizeOfInt32 {
@@ -1112,7 +1154,9 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(v), nil
+
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
case linux.SO_KEEPALIVE:
if outLen < sizeOfInt32 {
@@ -1123,13 +1167,17 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(v), nil
+
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
case linux.SO_LINGER:
if outLen < linux.SizeOfLinger {
return nil, syserr.ErrInvalidArgument
}
- return linux.Linger{}, nil
+
+ linger := linux.Linger{}
+ return &linger, nil
case linux.SO_SNDTIMEO:
// TODO(igudger): Linux allows shorter lengths for partial results.
@@ -1137,7 +1185,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
return nil, syserr.ErrInvalidArgument
}
- return linux.NsecToTimeval(s.SendTimeout()), nil
+ sendTimeout := linux.NsecToTimeval(s.SendTimeout())
+ return &sendTimeout, nil
case linux.SO_RCVTIMEO:
// TODO(igudger): Linux allows shorter lengths for partial results.
@@ -1145,7 +1194,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
return nil, syserr.ErrInvalidArgument
}
- return linux.NsecToTimeval(s.RecvTimeout()), nil
+ recvTimeout := linux.NsecToTimeval(s.RecvTimeout())
+ return &recvTimeout, nil
case linux.SO_OOBINLINE:
if outLen < sizeOfInt32 {
@@ -1157,7 +1207,20 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
return nil, syserr.TranslateNetstackError(err)
}
- return int32(v), nil
+ vP := primitive.Int32(v)
+ return &vP, nil
+
+ case linux.SO_NO_CHECK:
+ if outLen < sizeOfInt32 {
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ v, err := ep.GetSockOptBool(tcpip.NoChecksumOption)
+ if err != nil {
+ return nil, syserr.TranslateNetstackError(err)
+ }
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
default:
socket.GetSockOptEmitUnimplementedEvent(t, name)
@@ -1166,7 +1229,7 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
}
// getSockOptTCP implements GetSockOpt when level is SOL_TCP.
-func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interface{}, *syserr.Error) {
+func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) {
switch name {
case linux.TCP_NODELAY:
if outLen < sizeOfInt32 {
@@ -1177,7 +1240,9 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(!v), nil
+
+ vP := primitive.Int32(boolToInt32(!v))
+ return &vP, nil
case linux.TCP_CORK:
if outLen < sizeOfInt32 {
@@ -1188,7 +1253,9 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(v), nil
+
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
case linux.TCP_QUICKACK:
if outLen < sizeOfInt32 {
@@ -1199,7 +1266,9 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(v), nil
+
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
case linux.TCP_MAXSEG:
if outLen < sizeOfInt32 {
@@ -1210,8 +1279,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
-
- return int32(v), nil
+ vP := primitive.Int32(v)
+ return &vP, nil
case linux.TCP_KEEPIDLE:
if outLen < sizeOfInt32 {
@@ -1222,8 +1291,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
-
- return int32(time.Duration(v) / time.Second), nil
+ keepAliveIdle := primitive.Int32(time.Duration(v) / time.Second)
+ return &keepAliveIdle, nil
case linux.TCP_KEEPINTVL:
if outLen < sizeOfInt32 {
@@ -1234,8 +1303,20 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
+ keepAliveInterval := primitive.Int32(time.Duration(v) / time.Second)
+ return &keepAliveInterval, nil
- return int32(time.Duration(v) / time.Second), nil
+ case linux.TCP_KEEPCNT:
+ if outLen < sizeOfInt32 {
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ v, err := ep.GetSockOptInt(tcpip.KeepaliveCountOption)
+ if err != nil {
+ return nil, syserr.TranslateNetstackError(err)
+ }
+ vP := primitive.Int32(v)
+ return &vP, nil
case linux.TCP_USER_TIMEOUT:
if outLen < sizeOfInt32 {
@@ -1246,8 +1327,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
-
- return int32(time.Duration(v) / time.Millisecond), nil
+ tcpUserTimeout := primitive.Int32(time.Duration(v) / time.Millisecond)
+ return &tcpUserTimeout, nil
case linux.TCP_INFO:
var v tcpip.TCPInfoOption
@@ -1260,12 +1341,13 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
info := linux.TCPInfo{}
// Linux truncates the output binary to outLen.
- ib := binary.Marshal(nil, usermem.ByteOrder, &info)
- if len(ib) > outLen {
- ib = ib[:outLen]
+ buf := t.CopyScratchBuffer(info.SizeBytes())
+ info.MarshalUnsafe(buf)
+ if len(buf) > outLen {
+ buf = buf[:outLen]
}
-
- return ib, nil
+ bufP := primitive.ByteSlice(buf)
+ return &bufP, nil
case linux.TCP_CC_INFO,
linux.TCP_NOTSENT_LOWAT,
@@ -1295,7 +1377,9 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
}
b := make([]byte, toCopy)
copy(b, v)
- return b, nil
+
+ bP := primitive.ByteSlice(b)
+ return &bP, nil
case linux.TCP_LINGER2:
if outLen < sizeOfInt32 {
@@ -1307,7 +1391,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
return nil, syserr.TranslateNetstackError(err)
}
- return int32(time.Duration(v) / time.Second), nil
+ lingerTimeout := primitive.Int32(time.Duration(v) / time.Second)
+ return &lingerTimeout, nil
case linux.TCP_DEFER_ACCEPT:
if outLen < sizeOfInt32 {
@@ -1319,7 +1404,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
return nil, syserr.TranslateNetstackError(err)
}
- return int32(time.Duration(v) / time.Second), nil
+ tcpDeferAccept := primitive.Int32(time.Duration(v) / time.Second)
+ return &tcpDeferAccept, nil
case linux.TCP_SYNCNT:
if outLen < sizeOfInt32 {
@@ -1330,8 +1416,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
-
- return int32(v), nil
+ vP := primitive.Int32(v)
+ return &vP, nil
case linux.TCP_WINDOW_CLAMP:
if outLen < sizeOfInt32 {
@@ -1342,8 +1428,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
-
- return int32(v), nil
+ vP := primitive.Int32(v)
+ return &vP, nil
default:
emitUnimplementedEventTCP(t, name)
}
@@ -1351,7 +1437,7 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
}
// getSockOptIPv6 implements GetSockOpt when level is SOL_IPV6.
-func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interface{}, *syserr.Error) {
+func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) {
switch name {
case linux.IPV6_V6ONLY:
if outLen < sizeOfInt32 {
@@ -1362,7 +1448,9 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(v), nil
+
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
case linux.IPV6_PATHMTU:
t.Kernel().EmitUnimplementedEvent(t)
@@ -1370,21 +1458,24 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
case linux.IPV6_TCLASS:
// Length handling for parity with Linux.
if outLen == 0 {
- return make([]byte, 0), nil
+ var b primitive.ByteSlice
+ return &b, nil
}
v, err := ep.GetSockOptInt(tcpip.IPv6TrafficClassOption)
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- uintv := uint32(v)
+ uintv := primitive.Uint32(v)
// Linux truncates the output binary to outLen.
- ib := binary.Marshal(nil, usermem.ByteOrder, &uintv)
+ ib := t.CopyScratchBuffer(uintv.SizeBytes())
+ uintv.MarshalUnsafe(ib)
// Handle cases where outLen is less than sizeOfInt32.
if len(ib) > outLen {
ib = ib[:outLen]
}
- return ib, nil
+ ibP := primitive.ByteSlice(ib)
+ return &ibP, nil
case linux.IPV6_RECVTCLASS:
if outLen < sizeOfInt32 {
@@ -1395,7 +1486,13 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(v), nil
+
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
+
+ case linux.SO_ORIGINAL_DST:
+ // TODO(gvisor.dev/issue/170): ip6tables.
+ return nil, syserr.ErrInvalidArgument
default:
emitUnimplementedEventIPv6(t, name)
@@ -1404,7 +1501,7 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
}
// getSockOptIP implements GetSockOpt when level is SOL_IP.
-func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family int) (interface{}, *syserr.Error) {
+func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family int) (marshal.Marshallable, *syserr.Error) {
switch name {
case linux.IP_TTL:
if outLen < sizeOfInt32 {
@@ -1417,11 +1514,12 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
}
// Fill in the default value, if needed.
- if v == 0 {
- v = DefaultTTL
+ vP := primitive.Int32(v)
+ if vP == 0 {
+ vP = DefaultTTL
}
- return int32(v), nil
+ return &vP, nil
case linux.IP_MULTICAST_TTL:
if outLen < sizeOfInt32 {
@@ -1433,7 +1531,8 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
return nil, syserr.TranslateNetstackError(err)
}
- return int32(v), nil
+ vP := primitive.Int32(v)
+ return &vP, nil
case linux.IP_MULTICAST_IF:
if outLen < len(linux.InetAddr{}) {
@@ -1447,7 +1546,7 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
a, _ := ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr})
- return a.(*linux.SockAddrInet).Addr, nil
+ return &a.(*linux.SockAddrInet).Addr, nil
case linux.IP_MULTICAST_LOOP:
if outLen < sizeOfInt32 {
@@ -1458,21 +1557,26 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(v), nil
+
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
case linux.IP_TOS:
// Length handling for parity with Linux.
if outLen == 0 {
- return []byte(nil), nil
+ var b primitive.ByteSlice
+ return &b, nil
}
v, err := ep.GetSockOptInt(tcpip.IPv4TOSOption)
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
if outLen < sizeOfInt32 {
- return uint8(v), nil
+ vP := primitive.Uint8(v)
+ return &vP, nil
}
- return int32(v), nil
+ vP := primitive.Int32(v)
+ return &vP, nil
case linux.IP_RECVTOS:
if outLen < sizeOfInt32 {
@@ -1483,7 +1587,9 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(v), nil
+
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
case linux.IP_PKTINFO:
if outLen < sizeOfInt32 {
@@ -1494,7 +1600,22 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- return boolToInt32(v), nil
+
+ vP := primitive.Int32(boolToInt32(v))
+ return &vP, nil
+
+ case linux.SO_ORIGINAL_DST:
+ if outLen < int(binary.Size(linux.SockAddrInet{})) {
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ var v tcpip.OriginalDestinationOption
+ if err := ep.GetSockOpt(&v); err != nil {
+ return nil, syserr.TranslateNetstackError(err)
+ }
+
+ a, _ := ConvertAddress(linux.AF_INET, tcpip.FullAddress(v))
+ return a.(*linux.SockAddrInet), nil
default:
emitUnimplementedEventIP(t, name)
@@ -1698,6 +1819,14 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.OutOfBandInlineOption(v)))
+ case linux.SO_NO_CHECK:
+ if len(optVal) < sizeOfInt32 {
+ return syserr.ErrInvalidArgument
+ }
+
+ v := usermem.ByteOrder.Uint32(optVal)
+ return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.NoChecksumOption, v != 0))
+
case linux.SO_LINGER:
if len(optVal) < linux.SizeOfLinger {
return syserr.ErrInvalidArgument
@@ -1712,6 +1841,11 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
return nil
+ case linux.SO_DETACH_FILTER:
+ // optVal is ignored.
+ var v tcpip.SocketDetachFilterOption
+ return syserr.TranslateNetstackError(ep.SetSockOpt(v))
+
default:
socket.SetSockOptEmitUnimplementedEvent(t, name)
}
@@ -1777,6 +1911,17 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
}
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v))))
+ case linux.TCP_KEEPCNT:
+ if len(optVal) < sizeOfInt32 {
+ return syserr.ErrInvalidArgument
+ }
+
+ v := usermem.ByteOrder.Uint32(optVal)
+ if v < 1 || v > linux.MAX_TCP_KEEPCNT {
+ return syserr.ErrInvalidArgument
+ }
+ return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.KeepaliveCountOption, int(v)))
+
case linux.TCP_USER_TIMEOUT:
if len(optVal) < sizeOfInt32 {
return syserr.ErrInvalidArgument
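The TCP_KEEPCNT setter above mirrors Linux's bounds check (MAX_TCP_KEEPCNT is 127 in the kernel's headers). A self-contained sketch of the validation, with local stand-ins for the syserr and usermem pieces:

package main

import (
	"encoding/binary"
	"errors"
	"fmt"
)

// maxTCPKeepCnt mirrors linux.MAX_TCP_KEEPCNT (127 in the kernel).
const maxTCPKeepCnt = 127

var errInvalidArgument = errors.New("EINVAL")

// setKeepCnt mirrors the TCP_KEEPCNT validation above: at least four
// bytes of option data, and a probe count in [1, MAX_TCP_KEEPCNT].
func setKeepCnt(optVal []byte) (int, error) {
	if len(optVal) < 4 {
		return 0, errInvalidArgument
	}
	v := binary.LittleEndian.Uint32(optVal)
	if v < 1 || v > maxTCPKeepCnt {
		return 0, errInvalidArgument
	}
	return int(v), nil
}

func main() {
	fmt.Println(setKeepCnt([]byte{9, 0, 0, 0}))   // 9 <nil>
	fmt.Println(setKeepCnt([]byte{200, 0, 0, 0})) // 0 EINVAL
}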
@@ -2060,13 +2205,22 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
}
return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveIPPacketInfoOption, v != 0))
+ case linux.IP_HDRINCL:
+ if len(optVal) == 0 {
+ return nil
+ }
+ v, err := parseIntOrChar(optVal)
+ if err != nil {
+ return err
+ }
+ return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.IPHdrIncludedOption, v != 0))
+
case linux.IP_ADD_SOURCE_MEMBERSHIP,
linux.IP_BIND_ADDRESS_NO_PORT,
linux.IP_BLOCK_SOURCE,
linux.IP_CHECKSUM,
linux.IP_DROP_SOURCE_MEMBERSHIP,
linux.IP_FREEBIND,
- linux.IP_HDRINCL,
linux.IP_IPSEC_POLICY,
linux.IP_MINTTL,
linux.IP_MSFILTER,
@@ -2106,30 +2260,20 @@ func emitUnimplementedEventTCP(t *kernel.Task, name int) {
switch name {
case linux.TCP_CONGESTION,
linux.TCP_CORK,
- linux.TCP_DEFER_ACCEPT,
linux.TCP_FASTOPEN,
linux.TCP_FASTOPEN_CONNECT,
linux.TCP_FASTOPEN_KEY,
linux.TCP_FASTOPEN_NO_COOKIE,
- linux.TCP_KEEPCNT,
- linux.TCP_KEEPIDLE,
- linux.TCP_KEEPINTVL,
- linux.TCP_LINGER2,
- linux.TCP_MAXSEG,
linux.TCP_QUEUE_SEQ,
- linux.TCP_QUICKACK,
linux.TCP_REPAIR,
linux.TCP_REPAIR_QUEUE,
linux.TCP_REPAIR_WINDOW,
linux.TCP_SAVED_SYN,
linux.TCP_SAVE_SYN,
- linux.TCP_SYNCNT,
linux.TCP_THIN_DUPACK,
linux.TCP_THIN_LINEAR_TIMEOUTS,
linux.TCP_TIMESTAMP,
- linux.TCP_ULP,
- linux.TCP_USER_TIMEOUT,
- linux.TCP_WINDOW_CLAMP:
+ linux.TCP_ULP:
t.Kernel().EmitUnimplementedEvent(t)
}
@@ -2291,7 +2435,7 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32)
return &out, uint32(sockAddrInet6Size)
case linux.AF_PACKET:
- // TODO(b/129292371): Return protocol too.
+ // TODO(gvisor.dev/issue/173): Return protocol too.
var out linux.SockAddrLink
out.Family = linux.AF_PACKET
out.InterfaceIndex = int32(addr.NIC)
@@ -2397,6 +2541,23 @@ func (s *socketOpsCommon) fillCmsgInq(cmsg *socket.ControlMessages) {
cmsg.IP.Inq = int32(len(s.readView) + rcvBufUsed)
}
+func toLinuxPacketType(pktType tcpip.PacketType) uint8 {
+ switch pktType {
+ case tcpip.PacketHost:
+ return linux.PACKET_HOST
+ case tcpip.PacketOtherHost:
+ return linux.PACKET_OTHERHOST
+ case tcpip.PacketOutgoing:
+ return linux.PACKET_OUTGOING
+ case tcpip.PacketBroadcast:
+ return linux.PACKET_BROADCAST
+ case tcpip.PacketMulticast:
+ return linux.PACKET_MULTICAST
+ default:
+ panic(fmt.Sprintf("unknown packet type: %d", pktType))
+ }
+}
+
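toLinuxPacketType feeds the SockAddrLink fix in the nonBlockingRead hunk below, where the sender's protocol is also byte-swapped to network order. A standalone sketch of that consumption, assuming a little-endian host and a pared-down SockAddrLink:

package main

import "fmt"

// htons swaps a 16-bit value to network byte order, as the sentry does
// for SockAddrLink.Protocol (assumes a little-endian host).
func htons(v uint16) uint16 { return v<<8 | v>>8 }

// sockAddrLink is a pared-down stand-in for linux.SockAddrLink.
type sockAddrLink struct {
	Protocol   uint16
	PacketType uint8
}

func main() {
	var sa sockAddrLink
	sa.Protocol = htons(0x0800) // ETH_P_IP
	sa.PacketType = 4           // PACKET_OUTGOING, per toLinuxPacketType
	fmt.Println(sa.Protocol, sa.PacketType) // 8 4
}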
// nonBlockingRead issues a non-blocking read.
//
// TODO(b/78348848): Support timestamps for stream sockets.
@@ -2452,6 +2613,11 @@ func (s *socketOpsCommon) nonBlockingRead(ctx context.Context, dst usermem.IOSeq
var addrLen uint32
if isPacket && senderRequested {
addr, addrLen = ConvertAddress(s.family, s.sender)
+ switch v := addr.(type) {
+ case *linux.SockAddrLink:
+ v.Protocol = htons(uint16(s.linkPacketInfo.Protocol))
+ v.PacketType = toLinuxPacketType(s.linkPacketInfo.PktType)
+ }
}
if peek {
@@ -2686,11 +2852,16 @@ func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO,
}
func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ panic("ioctl(2) may only be called from a task goroutine")
+ }
+
// SIOCGSTAMP is implemented by netstack rather than all commonEndpoint
// sockets.
// TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP.
switch args[1].Int() {
- case syscall.SIOCGSTAMP:
+ case linux.SIOCGSTAMP:
s.readMu.Lock()
defer s.readMu.Unlock()
if !s.timestampValid {
@@ -2698,9 +2869,7 @@ func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.Sy
}
tv := linux.NsecToTimeval(s.timestampNS)
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &tv, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := tv.CopyOut(t, args[2].Pointer())
return 0, err
case linux.TIOCINQ:
@@ -2719,9 +2888,8 @@ func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.Sy
}
// Copy result to userspace.
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ vP := primitive.Int32(v)
+ _, err := vP.CopyOut(t, args[2].Pointer())
return 0, err
}
@@ -2730,52 +2898,49 @@ func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.Sy
// Ioctl performs a socket ioctl.
func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ panic("ioctl(2) may only be called from a task goroutine")
+ }
+
switch arg := int(args[1].Int()); arg {
- case syscall.SIOCGIFFLAGS,
- syscall.SIOCGIFADDR,
- syscall.SIOCGIFBRDADDR,
- syscall.SIOCGIFDSTADDR,
- syscall.SIOCGIFHWADDR,
- syscall.SIOCGIFINDEX,
- syscall.SIOCGIFMAP,
- syscall.SIOCGIFMETRIC,
- syscall.SIOCGIFMTU,
- syscall.SIOCGIFNAME,
- syscall.SIOCGIFNETMASK,
- syscall.SIOCGIFTXQLEN:
+ case linux.SIOCGIFFLAGS,
+ linux.SIOCGIFADDR,
+ linux.SIOCGIFBRDADDR,
+ linux.SIOCGIFDSTADDR,
+ linux.SIOCGIFHWADDR,
+ linux.SIOCGIFINDEX,
+ linux.SIOCGIFMAP,
+ linux.SIOCGIFMETRIC,
+ linux.SIOCGIFMTU,
+ linux.SIOCGIFNAME,
+ linux.SIOCGIFNETMASK,
+ linux.SIOCGIFTXQLEN,
+ linux.SIOCETHTOOL:
var ifr linux.IFReq
- if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &ifr, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ if _, err := ifr.CopyIn(t, args[2].Pointer()); err != nil {
return 0, err
}
if err := interfaceIoctl(ctx, io, arg, &ifr); err != nil {
return 0, err.ToError()
}
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &ifr, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := ifr.CopyOut(t, args[2].Pointer())
return 0, err
- case syscall.SIOCGIFCONF:
+ case linux.SIOCGIFCONF:
// Return a list of interface addresses or the buffer size
// necessary to hold the list.
var ifc linux.IFConf
- if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &ifc, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ if _, err := ifc.CopyIn(t, args[2].Pointer()); err != nil {
return 0, err
}
- if err := ifconfIoctl(ctx, io, &ifc); err != nil {
+ if err := ifconfIoctl(ctx, t, io, &ifc); err != nil {
return 0, err
}
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), ifc, usermem.IOOpts{
- AddressSpaceActive: true,
- })
-
+ _, err := ifc.CopyOut(t, args[2].Pointer())
return 0, err
case linux.TIOCINQ:
@@ -2788,9 +2953,8 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc
v = math.MaxInt32
}
// Copy result to userspace.
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ vP := primitive.Int32(v)
+ _, err := vP.CopyOut(t, args[2].Pointer())
return 0, err
case linux.TIOCOUTQ:
@@ -2804,9 +2968,8 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc
}
// Copy result to userspace.
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ vP := primitive.Int32(v)
+ _, err := vP.CopyOut(t, args[2].Pointer())
return 0, err
case linux.SIOCGIFMEM, linux.SIOCGIFPFLAGS, linux.SIOCGMIIPHY, linux.SIOCGMIIREG:
@@ -2832,7 +2995,7 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe
// SIOCGIFNAME uses ifr.ifr_ifindex rather than ifr.ifr_name to
// identify a device.
- if arg == syscall.SIOCGIFNAME {
+ if arg == linux.SIOCGIFNAME {
// Gets the name of the interface given the interface index
// stored in ifr_ifindex.
index = int32(usermem.ByteOrder.Uint32(ifr.Data[:4]))
@@ -2855,21 +3018,28 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe
}
switch arg {
- case syscall.SIOCGIFINDEX:
+ case linux.SIOCGIFINDEX:
// Copy out the index to the data.
usermem.ByteOrder.PutUint32(ifr.Data[:], uint32(index))
- case syscall.SIOCGIFHWADDR:
+ case linux.SIOCGIFHWADDR:
// Copy the hardware address out.
- ifr.Data[0] = 6 // IEEE802.2 arp type.
- ifr.Data[1] = 0
+ //
+ // See: https://linux.die.net/man/7/netdevice
+ // SIOCGIFHWADDR, SIOCSIFHWADDR
+ //
+ // Get or set the hardware address of a device using
+ // ifr_hwaddr. The hardware address is specified in a struct
+ // sockaddr. sa_family contains the ARPHRD_* device type,
+ // sa_data the L2 hardware address starting from byte 0. Setting
+ // the hardware address is a privileged operation.
+ usermem.ByteOrder.PutUint16(ifr.Data[:], iface.DeviceType)
n := copy(ifr.Data[2:], iface.Addr)
for i := 2 + n; i < len(ifr.Data); i++ {
ifr.Data[i] = 0 // Clear padding.
}
- usermem.ByteOrder.PutUint16(ifr.Data[:2], uint16(n))
- case syscall.SIOCGIFFLAGS:
+ case linux.SIOCGIFFLAGS:
f, err := interfaceStatusFlags(stack, iface.Name)
if err != nil {
return err
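The SIOCGIFHWADDR change above replaces a bogus length prefix with the layout netdevice(7) describes: the ARPHRD_* device type in sa_family (bytes 0-1 of ifr_ifru, native byte order) and the L2 address from byte 2. A standalone sketch with a 24-byte stand-in for linux.IFReq.Data:

package main

import (
	"encoding/binary"
	"fmt"
)

const arphrdEther = 1 // linux.ARPHRD_ETHER

// packHWAddr lays out struct sockaddr inside the 24-byte ifr_ifru union
// as the fixed SIOCGIFHWADDR path does: device type first, then the L2
// address from byte 2, zeroed padding after it.
func packHWAddr(data *[24]byte, devType uint16, addr []byte) {
	binary.LittleEndian.PutUint16(data[:2], devType)
	n := copy(data[2:], addr)
	for i := 2 + n; i < len(data); i++ {
		data[i] = 0
	}
}

func main() {
	var data [24]byte
	packHWAddr(&data, arphrdEther, []byte{0xde, 0xad, 0xbe, 0xef, 0x00, 0x01})
	fmt.Printf("% x\n", data[:8]) // 01 00 de ad be ef 00 01
}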
@@ -2878,7 +3048,7 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe
// matches Linux behavior.
usermem.ByteOrder.PutUint16(ifr.Data[:2], uint16(f))
- case syscall.SIOCGIFADDR:
+ case linux.SIOCGIFADDR:
// Copy the IPv4 address out.
for _, addr := range stack.InterfaceAddrs()[index] {
// This ioctl is only compatible with AF_INET addresses.
@@ -2889,32 +3059,32 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe
break
}
- case syscall.SIOCGIFMETRIC:
+ case linux.SIOCGIFMETRIC:
// Gets the metric of the device. As per netdevice(7), this
// always just sets ifr_metric to 0.
usermem.ByteOrder.PutUint32(ifr.Data[:4], 0)
- case syscall.SIOCGIFMTU:
+ case linux.SIOCGIFMTU:
// Gets the MTU of the device.
usermem.ByteOrder.PutUint32(ifr.Data[:4], iface.MTU)
- case syscall.SIOCGIFMAP:
+ case linux.SIOCGIFMAP:
// Gets the hardware parameters of the device.
// TODO(gvisor.dev/issue/505): Implement.
- case syscall.SIOCGIFTXQLEN:
+ case linux.SIOCGIFTXQLEN:
// Gets the transmit queue length of the device.
// TODO(gvisor.dev/issue/505): Implement.
- case syscall.SIOCGIFDSTADDR:
+ case linux.SIOCGIFDSTADDR:
// Gets the destination address of a point-to-point device.
// TODO(gvisor.dev/issue/505): Implement.
- case syscall.SIOCGIFBRDADDR:
+ case linux.SIOCGIFBRDADDR:
// Gets the broadcast address of a device.
// TODO(gvisor.dev/issue/505): Implement.
- case syscall.SIOCGIFNETMASK:
+ case linux.SIOCGIFNETMASK:
// Gets the network mask of a device.
for _, addr := range stack.InterfaceAddrs()[index] {
// This ioctl is only compatible with AF_INET addresses.
@@ -2931,6 +3101,14 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe
break
}
+ case linux.SIOCETHTOOL:
+ // Stubbed out for now. Ideally we should implement the required
+ // sub-commands for ETHTOOL.
+ //
+ // See:
+ // https://github.com/torvalds/linux/blob/aa0c9086b40c17a7ad94425b3b70dd1fdd7497bf/net/core/dev_ioctl.c
+ return syserr.ErrEndpointOperation
+
default:
// Not a valid call.
return syserr.ErrInvalidArgument
@@ -2940,7 +3118,7 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe
}
// ifconfIoctl populates a struct ifconf for the SIOCGIFCONF ioctl.
-func ifconfIoctl(ctx context.Context, io usermem.IO, ifc *linux.IFConf) error {
+func ifconfIoctl(ctx context.Context, t *kernel.Task, io usermem.IO, ifc *linux.IFConf) error {
// If Ptr is NULL, return the necessary buffer size via Len.
// Otherwise, write up to Len bytes starting at Ptr containing ifreq
// structs.
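ifconfIoctl implements the usual SIOCGIFCONF two-phase contract: a NULL ifc_buf asks for the required size, a non-NULL buffer is filled with as many whole ifreq entries as fit, and ifc_len reports bytes actually written. A minimal sketch of that contract (the 40-byte size assumes 64-bit Linux):

package main

import "fmt"

const sizeOfIFReq = 40 // sizeof(struct ifreq) on 64-bit Linux

// ifconfSketch mirrors the SIOCGIFCONF contract: a nil buffer returns the
// size needed for every interface; otherwise whole ifreq entries are
// written until space runs out, and the return value is bytes used.
func ifconfSketch(numIfaces int, buf []byte) int {
	if buf == nil {
		return numIfaces * sizeOfIFReq
	}
	used := 0
	for i := 0; i < numIfaces && used+sizeOfIFReq <= len(buf); i++ {
		used += sizeOfIFReq // a real ifreq would be marshaled here
	}
	return used
}

func main() {
	fmt.Println(ifconfSketch(3, nil))               // 120
	fmt.Println(ifconfSketch(3, make([]byte, 100))) // 80: only two fit
}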
@@ -2977,9 +3155,7 @@ func ifconfIoctl(ctx context.Context, io usermem.IO, ifc *linux.IFConf) error {
// Copy the ifr to userspace.
dst := uintptr(ifc.Ptr) + uintptr(ifc.Len)
ifc.Len += int32(linux.SizeOfIFReq)
- if _, err := usermem.CopyObjectOut(ctx, io, usermem.Addr(dst), ifr, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ if _, err := ifr.CopyOut(t, usermem.Addr(dst)); err != nil {
return err
}
}
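The ioctl hunks above all trade usermem.CopyObjectOut/CopyObjectIn for the generated CopyOut/CopyIn methods, which take the task as the marshalling target. A standalone sketch of that shape, with a byte slice standing in for the task's address space:

package main

import (
	"encoding/binary"
	"fmt"
)

// target stands in for the marshalling target (a *kernel.Task in the
// tree); user memory is modeled as a byte slice.
type target struct{ mem []byte }

// timeval is a pared-down stand-in for linux.Timeval.
type timeval struct{ Sec, Usec int64 }

// CopyOut mirrors the generated tv.CopyOut(t, addr) used above: marshal
// the struct into the task's address space at addr.
func (tv *timeval) CopyOut(t *target, addr int) (int, error) {
	binary.LittleEndian.PutUint64(t.mem[addr:], uint64(tv.Sec))
	binary.LittleEndian.PutUint64(t.mem[addr+8:], uint64(tv.Usec))
	return 16, nil
}

func main() {
	t := &target{mem: make([]byte, 32)}
	tv := timeval{Sec: 1, Usec: 2}
	n, _ := tv.CopyOut(t, 8)
	fmt.Println(n, t.mem[8:24]) // 16 [1 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0]
}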
diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go
index fcd8013c0..3335e7430 100644
--- a/pkg/sentry/socket/netstack/netstack_vfs2.go
+++ b/pkg/sentry/socket/netstack/netstack_vfs2.go
@@ -19,6 +19,7 @@ import (
"gvisor.dev/gvisor/pkg/amutex"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -30,6 +31,8 @@ import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
+ "gvisor.dev/gvisor/tools/go_marshal/marshal"
+ "gvisor.dev/gvisor/tools/go_marshal/primitive"
)
// SocketVFS2 encapsulates all the state needed to represent a network stack
@@ -38,6 +41,7 @@ type SocketVFS2 struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
+ vfs.LockFD
socketOpsCommon
}
@@ -64,6 +68,7 @@ func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, qu
protocol: protocol,
},
}
+ s.LockFD.Init(&vfs.FileLocks{})
vfsfd := &s.vfsfd
if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{
DenyPRead: true,
@@ -164,7 +169,7 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block
if err != nil {
return 0, nil, 0, err
}
- defer ns.DecRef()
+ defer ns.DecRef(t)
if err := ns.SetStatusFlags(t, t.Credentials(), uint32(flags&linux.SOCK_NONBLOCK)); err != nil {
return 0, nil, 0, syserr.FromError(err)
@@ -197,7 +202,7 @@ func (s *SocketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal
// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
// tcpip.Endpoint.
-func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
+func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
// TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is
// implemented specifically for netstack.SocketVFS2 rather than
// commonEndpoint. commonEndpoint should be extended to support socket
@@ -207,25 +212,25 @@ func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
- val := int32(0)
+ val := primitive.Int32(0)
s.readMu.Lock()
defer s.readMu.Unlock()
if s.sockOptTimestamp {
val = 1
}
- return val, nil
+ return &val, nil
}
if level == linux.SOL_TCP && name == linux.TCP_INQ {
if outLen < sizeOfInt32 {
return nil, syserr.ErrInvalidArgument
}
- val := int32(0)
+ val := primitive.Int32(0)
s.readMu.Lock()
defer s.readMu.Unlock()
if s.sockOptInq {
val = 1
}
- return val, nil
+ return &val, nil
}
if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP {
@@ -243,7 +248,7 @@ func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.
if err != nil {
return nil, err
}
- return info, nil
+ return &info, nil
case linux.IPT_SO_GET_ENTRIES:
if outLen < linux.SizeOfIPTGetEntries {
@@ -258,7 +263,7 @@ func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.
if err != nil {
return nil, err
}
- return entries, nil
+ return &entries, nil
}
}
@@ -315,3 +320,13 @@ func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []by
return SetSockOpt(t, s, s.Endpoint, level, name, optVal)
}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence)
+}
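Both netstack and unix VFS2 sockets in this patch embed vfs.LockFD and forward LockPOSIX/UnlockPOSIX to a shared vfs.FileLocks. A rough standalone sketch of the delegation pattern (the stand-ins are simplified; real POSIX locks are ranged and owner-keyed):

package main

import (
	"fmt"
	"sync"
)

// fileLocks stands in for vfs.FileLocks: lock state shared by every file
// description opened on the socket.
type fileLocks struct {
	mu    sync.Mutex
	owner string
}

func (fl *fileLocks) lockPOSIX(uid string) error {
	fl.mu.Lock()
	defer fl.mu.Unlock()
	if fl.owner != "" && fl.owner != uid {
		return fmt.Errorf("EAGAIN: held by %s", fl.owner)
	}
	fl.owner = uid
	return nil
}

// lockFD mirrors vfs.LockFD: an embeddable struct whose Init wires up the
// shared table, so LockPOSIX on the socket just forwards to it.
type lockFD struct{ locks *fileLocks }

func (fd *lockFD) Init(fl *fileLocks) { fd.locks = fl }

func main() {
	shared := &fileLocks{}
	var a, b lockFD
	a.Init(shared)
	b.Init(shared)
	fmt.Println(a.locks.lockPOSIX("uid-1")) // <nil>
	fmt.Println(b.locks.lockPOSIX("uid-2")) // EAGAIN: held by uid-1
}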
diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go
index f5fa18136..f0fe18684 100644
--- a/pkg/sentry/socket/netstack/stack.go
+++ b/pkg/sentry/socket/netstack/stack.go
@@ -15,10 +15,11 @@
package netstack
import (
+ "fmt"
+
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/inet"
- "gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/tcpip"
@@ -41,19 +42,29 @@ func (s *Stack) SupportsIPv6() bool {
return s.Stack.CheckNetworkProtocol(ipv6.ProtocolNumber)
}
+// toLinuxARPHardwareType converts Netstack's ARPHardwareType to the
+// equivalent Linux constant.
+func toLinuxARPHardwareType(t header.ARPHardwareType) uint16 {
+ switch t {
+ case header.ARPHardwareNone:
+ return linux.ARPHRD_NONE
+ case header.ARPHardwareLoopback:
+ return linux.ARPHRD_LOOPBACK
+ case header.ARPHardwareEther:
+ return linux.ARPHRD_ETHER
+ default:
+ panic(fmt.Sprintf("unknown ARPHRD type: %d", t))
+ }
+}
+
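Before this change, Interfaces reported DeviceType 0 (ARPHRD_NETROM) for everything but loopback; the new mapping surfaces the NIC's real type, and panicking on an unknown value is defensible because netstack owns the enum. A standalone sketch using the uapi if_arp.h values:

package main

import "fmt"

// ARPHRD_* values from uapi/linux/if_arp.h.
const (
	arphrdEther    uint16 = 1
	arphrdLoopback uint16 = 772
	arphrdNone     uint16 = 0xfffe
)

// arpHardwareType stands in for header.ARPHardwareType.
type arpHardwareType int

const (
	arpHardwareNone arpHardwareType = iota
	arpHardwareEther
	arpHardwareLoopback
)

func toLinuxARPHardwareType(t arpHardwareType) uint16 {
	switch t {
	case arpHardwareNone:
		return arphrdNone
	case arpHardwareLoopback:
		return arphrdLoopback
	case arpHardwareEther:
		return arphrdEther
	default:
		panic(fmt.Sprintf("unknown ARPHRD type: %d", t))
	}
}

func main() {
	// An ethernet NIC now reports 1 (ARPHRD_ETHER) rather than 0.
	fmt.Println(toLinuxARPHardwareType(arpHardwareEther))
}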
// Interfaces implements inet.Stack.Interfaces.
func (s *Stack) Interfaces() map[int32]inet.Interface {
is := make(map[int32]inet.Interface)
for id, ni := range s.Stack.NICInfo() {
- var devType uint16
- if ni.Flags.Loopback {
- devType = linux.ARPHRD_LOOPBACK
- }
is[int32(id)] = inet.Interface{
Name: ni.Name,
Addr: []byte(ni.LinkAddress),
Flags: uint32(nicStateFlagsToLinux(ni.Flags)),
- DeviceType: devType,
+ DeviceType: toLinuxARPHardwareType(ni.ARPHardwareType),
MTU: ni.MTU,
}
}
@@ -196,6 +207,20 @@ func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(enabled))).ToError()
}
+// TCPRecovery implements inet.Stack.TCPRecovery.
+func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) {
+ var recovery tcp.Recovery
+ if err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &recovery); err != nil {
+ return 0, syserr.TranslateNetstackError(err).ToError()
+ }
+ return inet.TCPLossRecovery(recovery), nil
+}
+
+// SetTCPRecovery implements inet.Stack.SetTCPRecovery.
+func (s *Stack) SetTCPRecovery(recovery inet.TCPLossRecovery) error {
+ return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.Recovery(recovery))).ToError()
+}
+
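TCPRecovery/SetTCPRecovery pass a bitmask through to netstack, mirroring /proc/sys/net/ipv4/tcp_recovery (bit 0 enables RACK loss detection). A minimal sketch of the round-trip, with local stand-ins for the inet and tcp types:

package main

import "fmt"

// tcpLossRecovery stands in for inet.TCPLossRecovery, a bitmask matching
// /proc/sys/net/ipv4/tcp_recovery; bit 0 enables RACK loss detection.
type tcpLossRecovery int32

const rackLossDetection tcpLossRecovery = 1 << 0

// stackSketch models the pass-through the new methods perform.
type stackSketch struct{ recovery tcpLossRecovery }

func (s *stackSketch) SetTCPRecovery(r tcpLossRecovery) error { s.recovery = r; return nil }

func (s *stackSketch) TCPRecovery() (tcpLossRecovery, error) { return s.recovery, nil }

func main() {
	var s stackSketch
	_ = s.SetTCPRecovery(rackLossDetection)
	r, _ := s.TCPRecovery()
	fmt.Println(r&rackLossDetection != 0) // true
}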
// Statistics implements inet.Stack.Statistics.
func (s *Stack) Statistics(stat interface{}, arg string) error {
switch stats := stat.(type) {
@@ -314,7 +339,7 @@ func (s *Stack) Statistics(stat interface{}, arg string) error {
udp.PacketsSent.Value(), // OutDatagrams.
udp.ReceiveBufferErrors.Value(), // RcvbufErrors.
0, // Udp/SndbufErrors.
- 0, // Udp/InCsumErrors.
+ udp.ChecksumErrors.Value(), // Udp/InCsumErrors.
0, // Udp/IgnoredMulti.
}
default:
@@ -362,16 +387,10 @@ func (s *Stack) RouteTable() []inet.Route {
}
// IPTables returns the stack's iptables.
-func (s *Stack) IPTables() (stack.IPTables, error) {
+func (s *Stack) IPTables() (*stack.IPTables, error) {
return s.Stack.IPTables(), nil
}
-// FillDefaultIPTables sets the stack's iptables to the default tables, which
-// allow and do not modify all traffic.
-func (s *Stack) FillDefaultIPTables() {
- netfilter.FillDefaultIPTables(s.Stack)
-}
-
// Resume implements inet.Stack.Resume.
func (s *Stack) Resume() {
s.Stack.Resume()
diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go
index 6580bd6e9..04b259d27 100644
--- a/pkg/sentry/socket/socket.go
+++ b/pkg/sentry/socket/socket.go
@@ -35,6 +35,7 @@ import (
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/tools/go_marshal/marshal"
)
// ControlMessages represents the union of unix control messages and tcpip
@@ -45,8 +46,8 @@ type ControlMessages struct {
}
// Release releases Unix domain socket credentials and rights.
-func (c *ControlMessages) Release() {
- c.Unix.Release()
+func (c *ControlMessages) Release(ctx context.Context) {
+ c.Unix.Release(ctx)
}
// Socket is an interface combining fs.FileOperations and SocketOps,
@@ -86,7 +87,7 @@ type SocketOps interface {
Shutdown(t *kernel.Task, how int) *syserr.Error
// GetSockOpt implements the getsockopt(2) linux syscall.
- GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error)
+ GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error)
// SetSockOpt implements the setsockopt(2) linux syscall.
SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error
@@ -407,7 +408,6 @@ func emitUnimplementedEvent(t *kernel.Task, name int) {
linux.SO_MARK,
linux.SO_MAX_PACING_RATE,
linux.SO_NOFCS,
- linux.SO_NO_CHECK,
linux.SO_OOBINLINE,
linux.SO_PASSCRED,
linux.SO_PASSSEC,
diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD
index de2cc4bdf..061a689a9 100644
--- a/pkg/sentry/socket/unix/BUILD
+++ b/pkg/sentry/socket/unix/BUILD
@@ -21,6 +21,7 @@ go_library(
"//pkg/sentry/device",
"//pkg/sentry/fs",
"//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/fsimpl/sockfs",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/time",
@@ -34,5 +35,6 @@ go_library(
"//pkg/tcpip",
"//pkg/usermem",
"//pkg/waiter",
+ "//tools/go_marshal/marshal",
],
)
diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go
index ce5b94ee7..c67b602f0 100644
--- a/pkg/sentry/socket/unix/transport/connectioned.go
+++ b/pkg/sentry/socket/unix/transport/connectioned.go
@@ -211,7 +211,7 @@ func (e *connectionedEndpoint) Listening() bool {
// The socket will be in a fresh state after a call to close and may be reused.
// That is, close may be used to "unbind" or "disconnect" the socket in error
// paths.
-func (e *connectionedEndpoint) Close() {
+func (e *connectionedEndpoint) Close(ctx context.Context) {
e.Lock()
var c ConnectedEndpoint
var r Receiver
@@ -233,7 +233,7 @@ func (e *connectionedEndpoint) Close() {
case e.Listening():
close(e.acceptedChan)
for n := range e.acceptedChan {
- n.Close()
+ n.Close(ctx)
}
e.acceptedChan = nil
e.path = ""
@@ -241,18 +241,18 @@ func (e *connectionedEndpoint) Close() {
e.Unlock()
if c != nil {
c.CloseNotify()
- c.Release()
+ c.Release(ctx)
}
if r != nil {
r.CloseNotify()
- r.Release()
+ r.Release(ctx)
}
}
// BidirectionalConnect implements BoundEndpoint.BidirectionalConnect.
func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error {
if ce.Type() != e.stype {
- return syserr.ErrConnectionRefused
+ return syserr.ErrWrongProtocolForSocket
}
// Check if ce is e to avoid a deadlock.
@@ -340,7 +340,7 @@ func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce Conn
return nil
default:
// Busy; return ECONNREFUSED per spec.
- ne.Close()
+ ne.Close(ctx)
e.Unlock()
ce.Unlock()
return syserr.ErrConnectionRefused
@@ -476,6 +476,9 @@ func (e *connectionedEndpoint) Readiness(mask waiter.EventMask) waiter.EventMask
// State implements socket.Socket.State.
func (e *connectionedEndpoint) State() uint32 {
+ e.Lock()
+ defer e.Unlock()
+
if e.Connected() {
return linux.SS_CONNECTED
}
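The bulk of the transport changes are mechanical: Close, Release, DecRef, Send, Recv, and Enqueue all gain a context.Context so nested releases can reach task state. A standalone sketch of the propagation pattern:

package main

import (
	"context"
	"fmt"
)

// releaser mirrors the new shape of the transport interfaces: releasing a
// resource takes a context so nested releases (queues, control messages,
// host FDs) can reach task state.
type releaser interface{ Release(ctx context.Context) }

type queue struct{ name string }

func (q *queue) Release(ctx context.Context) { fmt.Println("released", q.name) }

type endpoint struct{ connected releaser }

// Close propagates ctx downward, as connectionedEndpoint.Close(ctx) and
// connectionlessEndpoint.Close(ctx) now do.
func (e *endpoint) Close(ctx context.Context) {
	if e.connected != nil {
		e.connected.Release(ctx)
		e.connected = nil
	}
}

func main() {
	e := &endpoint{connected: &queue{name: "writeQueue"}}
	e.Close(context.Background())
}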
diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go
index 4b06d63ac..70ee8f9b8 100644
--- a/pkg/sentry/socket/unix/transport/connectionless.go
+++ b/pkg/sentry/socket/unix/transport/connectionless.go
@@ -54,10 +54,10 @@ func (e *connectionlessEndpoint) isBound() bool {
// Close puts the endpoint in a closed state and frees all resources associated
// with it.
-func (e *connectionlessEndpoint) Close() {
+func (e *connectionlessEndpoint) Close(ctx context.Context) {
e.Lock()
if e.connected != nil {
- e.connected.Release()
+ e.connected.Release(ctx)
e.connected = nil
}
@@ -71,7 +71,7 @@ func (e *connectionlessEndpoint) Close() {
e.Unlock()
r.CloseNotify()
- r.Release()
+ r.Release(ctx)
}
// BidirectionalConnect implements BoundEndpoint.BidirectionalConnect.
@@ -108,10 +108,10 @@ func (e *connectionlessEndpoint) SendMsg(ctx context.Context, data [][]byte, c C
if err != nil {
return 0, syserr.ErrInvalidEndpointState
}
- defer connected.Release()
+ defer connected.Release(ctx)
e.Lock()
- n, notify, err := connected.Send(data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)})
+ n, notify, err := connected.Send(ctx, data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)})
e.Unlock()
if notify {
@@ -135,7 +135,7 @@ func (e *connectionlessEndpoint) Connect(ctx context.Context, server BoundEndpoi
e.Lock()
if e.connected != nil {
- e.connected.Release()
+ e.connected.Release(ctx)
}
e.connected = connected
e.Unlock()
diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go
index d8f3ad63d..ef6043e19 100644
--- a/pkg/sentry/socket/unix/transport/queue.go
+++ b/pkg/sentry/socket/unix/transport/queue.go
@@ -15,6 +15,7 @@
package transport
import (
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserr"
@@ -57,10 +58,10 @@ func (q *queue) Close() {
// Both the read and write queues must be notified after resetting:
// q.ReaderQueue.Notify(waiter.EventIn)
// q.WriterQueue.Notify(waiter.EventOut)
-func (q *queue) Reset() {
+func (q *queue) Reset(ctx context.Context) {
q.mu.Lock()
for cur := q.dataList.Front(); cur != nil; cur = cur.Next() {
- cur.Release()
+ cur.Release(ctx)
}
q.dataList.Reset()
q.used = 0
@@ -68,8 +69,8 @@ func (q *queue) Reset() {
}
// DecRef implements RefCounter.DecRef with destructor q.Reset.
-func (q *queue) DecRef() {
- q.DecRefWithDestructor(q.Reset)
+func (q *queue) DecRef(ctx context.Context) {
+ q.DecRefWithDestructor(ctx, q.Reset)
// We don't need to notify after resetting because no one cares about
// this queue after all references have been dropped.
}
@@ -111,7 +112,7 @@ func (q *queue) IsWritable() bool {
//
// If notify is true, ReaderQueue.Notify must be called:
// q.ReaderQueue.Notify(waiter.EventIn)
-func (q *queue) Enqueue(data [][]byte, c ControlMessages, from tcpip.FullAddress, discardEmpty bool, truncate bool) (l int64, notify bool, err *syserr.Error) {
+func (q *queue) Enqueue(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress, discardEmpty bool, truncate bool) (l int64, notify bool, err *syserr.Error) {
q.mu.Lock()
if q.closed {
@@ -124,7 +125,7 @@ func (q *queue) Enqueue(data [][]byte, c ControlMessages, from tcpip.FullAddress
}
if discardEmpty && l == 0 {
q.mu.Unlock()
- c.Release()
+ c.Release(ctx)
return 0, false, nil
}
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index 2f1b127df..475d7177e 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -37,7 +37,7 @@ type RightsControlMessage interface {
Clone() RightsControlMessage
// Release releases any resources owned by the RightsControlMessage.
- Release()
+ Release(ctx context.Context)
}
// A CredentialsControlMessage is a control message containing Unix credentials.
@@ -74,9 +74,9 @@ func (c *ControlMessages) Clone() ControlMessages {
}
// Release releases both the credentials and the rights.
-func (c *ControlMessages) Release() {
+func (c *ControlMessages) Release(ctx context.Context) {
if c.Rights != nil {
- c.Rights.Release()
+ c.Rights.Release(ctx)
}
*c = ControlMessages{}
}
@@ -90,7 +90,7 @@ type Endpoint interface {
// Close puts the endpoint in a closed state and frees all resources
// associated with it.
- Close()
+ Close(ctx context.Context)
// RecvMsg reads data and a control message from the endpoint. This method
// does not block if there is no data pending.
@@ -252,7 +252,7 @@ type BoundEndpoint interface {
// Release releases any resources held by the BoundEndpoint. It must be
// called before dropping all references to a BoundEndpoint returned by a
// function.
- Release()
+ Release(ctx context.Context)
}
// message represents a message passed over a Unix domain socket.
@@ -281,8 +281,8 @@ func (m *message) Length() int64 {
}
// Release releases any resources held by the message.
-func (m *message) Release() {
- m.Control.Release()
+func (m *message) Release(ctx context.Context) {
+ m.Control.Release(ctx)
}
// Peek returns a copy of the message.
@@ -304,7 +304,7 @@ type Receiver interface {
// See Endpoint.RecvMsg for documentation on shared arguments.
//
// notify indicates if RecvNotify should be called.
- Recv(data [][]byte, creds bool, numRights int, peek bool) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, source tcpip.FullAddress, notify bool, err *syserr.Error)
+ Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, source tcpip.FullAddress, notify bool, err *syserr.Error)
// RecvNotify notifies the Receiver of a successful Recv. This must not be
// called while holding any endpoint locks.
@@ -333,7 +333,7 @@ type Receiver interface {
// Release releases any resources owned by the Receiver. It should be
// called before dropping all references to a Receiver.
- Release()
+ Release(ctx context.Context)
}
// queueReceiver implements Receiver for datagram sockets.
@@ -344,7 +344,7 @@ type queueReceiver struct {
}
// Recv implements Receiver.Recv.
-func (q *queueReceiver) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
+func (q *queueReceiver) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
var m *message
var notify bool
var err *syserr.Error
@@ -398,8 +398,8 @@ func (q *queueReceiver) RecvMaxQueueSize() int64 {
}
// Release implements Receiver.Release.
-func (q *queueReceiver) Release() {
- q.readQueue.DecRef()
+func (q *queueReceiver) Release(ctx context.Context) {
+ q.readQueue.DecRef(ctx)
}
// streamQueueReceiver implements Receiver for stream sockets.
@@ -456,7 +456,7 @@ func (q *streamQueueReceiver) RecvMaxQueueSize() int64 {
}
// Recv implements Receiver.Recv.
-func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
+func (q *streamQueueReceiver) Recv(ctx context.Context, data [][]byte, wantCreds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
q.mu.Lock()
defer q.mu.Unlock()
@@ -502,7 +502,7 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int,
var cmTruncated bool
if c.Rights != nil && numRights == 0 {
- c.Rights.Release()
+ c.Rights.Release(ctx)
c.Rights = nil
cmTruncated = true
}
@@ -557,7 +557,7 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int,
// Consume rights.
if numRights == 0 {
cmTruncated = true
- q.control.Rights.Release()
+ q.control.Rights.Release(ctx)
} else {
c.Rights = q.control.Rights
haveRights = true
@@ -582,7 +582,7 @@ type ConnectedEndpoint interface {
//
// syserr.ErrWouldBlock can be returned along with a partial write if
// the caller should block to send the rest of the data.
- Send(data [][]byte, c ControlMessages, from tcpip.FullAddress) (n int64, notify bool, err *syserr.Error)
+ Send(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress) (n int64, notify bool, err *syserr.Error)
// SendNotify notifies the ConnectedEndpoint of a successful Send. This
// must not be called while holding any endpoint locks.
@@ -616,7 +616,7 @@ type ConnectedEndpoint interface {
// Release releases any resources owned by the ConnectedEndpoint. It should
// be called before dropping all references to a ConnectedEndpoint.
- Release()
+ Release(ctx context.Context)
// CloseUnread sets the fact that this end is closed with unread data to
// the peer socket.
@@ -654,7 +654,7 @@ func (e *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error)
}
// Send implements ConnectedEndpoint.Send.
-func (e *connectedEndpoint) Send(data [][]byte, c ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) {
+func (e *connectedEndpoint) Send(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) {
discardEmpty := false
truncate := false
if e.endpoint.Type() == linux.SOCK_STREAM {
@@ -669,7 +669,7 @@ func (e *connectedEndpoint) Send(data [][]byte, c ControlMessages, from tcpip.Fu
truncate = true
}
- return e.writeQueue.Enqueue(data, c, from, discardEmpty, truncate)
+ return e.writeQueue.Enqueue(ctx, data, c, from, discardEmpty, truncate)
}
// SendNotify implements ConnectedEndpoint.SendNotify.
@@ -707,8 +707,8 @@ func (e *connectedEndpoint) SendMaxQueueSize() int64 {
}
// Release implements ConnectedEndpoint.Release.
-func (e *connectedEndpoint) Release() {
- e.writeQueue.DecRef()
+func (e *connectedEndpoint) Release(ctx context.Context) {
+ e.writeQueue.DecRef(ctx)
}
// CloseUnread implements ConnectedEndpoint.CloseUnread.
@@ -798,7 +798,7 @@ func (e *baseEndpoint) RecvMsg(ctx context.Context, data [][]byte, creds bool, n
return 0, 0, ControlMessages{}, false, syserr.ErrNotConnected
}
- recvLen, msgLen, cms, cmt, a, notify, err := e.receiver.Recv(data, creds, numRights, peek)
+ recvLen, msgLen, cms, cmt, a, notify, err := e.receiver.Recv(ctx, data, creds, numRights, peek)
e.Unlock()
if err != nil {
return 0, 0, ControlMessages{}, false, err
@@ -827,7 +827,7 @@ func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMess
return 0, syserr.ErrAlreadyConnected
}
- n, notify, err := e.connected.Send(data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)})
+ n, notify, err := e.connected.Send(ctx, data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)})
e.Unlock()
if notify {
@@ -1001,6 +1001,6 @@ func (e *baseEndpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
}
// Release implements BoundEndpoint.Release.
-func (*baseEndpoint) Release() {
+func (*baseEndpoint) Release(context.Context) {
// Binding a baseEndpoint doesn't take a reference.
}
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 5b29e9d7f..2b8454edb 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -40,6 +40,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
+ "gvisor.dev/gvisor/tools/go_marshal/marshal"
)
// SocketOperations is a Unix socket. It is similar to a netstack socket,
@@ -61,7 +62,7 @@ type SocketOperations struct {
// New creates a new unix socket.
func New(ctx context.Context, endpoint transport.Endpoint, stype linux.SockType) *fs.File {
dirent := socket.NewDirent(ctx, unixSocketDevice)
- defer dirent.DecRef()
+ defer dirent.DecRef(ctx)
return NewWithDirent(ctx, dirent, endpoint, stype, fs.FileFlags{Read: true, Write: true, NonSeekable: true})
}
@@ -96,17 +97,17 @@ type socketOpsCommon struct {
}
// DecRef implements RefCounter.DecRef.
-func (s *socketOpsCommon) DecRef() {
- s.DecRefWithDestructor(func() {
- s.ep.Close()
+func (s *socketOpsCommon) DecRef(ctx context.Context) {
+ s.DecRefWithDestructor(ctx, func(context.Context) {
+ s.ep.Close(ctx)
})
}
// Release implements fs.FileOperations.Release.
-func (s *socketOpsCommon) Release() {
+func (s *socketOpsCommon) Release(ctx context.Context) {
// Release only decrements a reference on s because s may be referenced in
// the abstract socket namespace.
- s.DecRef()
+ s.DecRef(ctx)
}
func (s *socketOpsCommon) isPacket() bool {
@@ -184,7 +185,7 @@ func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO,
// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
// a transport.Endpoint.
-func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
+func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen)
}
@@ -233,7 +234,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
}
ns := New(t, ep, s.stype)
- defer ns.DecRef()
+ defer ns.DecRef(t)
if flags&linux.SOCK_NONBLOCK != 0 {
flags := ns.Flags()
@@ -283,7 +284,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
if t.IsNetworkNamespaced() {
return syserr.ErrInvalidEndpointState
}
- if err := t.AbstractSockets().Bind(p[1:], bep, s); err != nil {
+ if err := t.AbstractSockets().Bind(t, p[1:], bep, s); err != nil {
// syserr.ErrPortInUse corresponds to EADDRINUSE.
return syserr.ErrPortInUse
}
@@ -293,7 +294,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
var name string
cwd := t.FSContext().WorkingDirectory()
- defer cwd.DecRef()
+ defer cwd.DecRef(t)
// Is there no slash at all?
if !strings.Contains(p, "/") {
@@ -301,7 +302,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
name = p
} else {
root := t.FSContext().RootDirectory()
- defer root.DecRef()
+ defer root.DecRef(t)
// Find the last path component; we know that something follows
// that final slash, otherwise extractPath() would have failed.
lastSlash := strings.LastIndex(p, "/")
@@ -317,7 +318,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
// No path available.
return syserr.ErrNoSuchFile
}
- defer d.DecRef()
+ defer d.DecRef(t)
name = p[lastSlash+1:]
}
@@ -331,7 +332,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
if err != nil {
return syserr.ErrPortInUse
}
- childDir.DecRef()
+ childDir.DecRef(t)
}
return nil
@@ -377,9 +378,9 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint,
FollowFinalSymlink: true,
}
ep, e := t.Kernel().VFS().BoundEndpointAt(t, t.Credentials(), &pop, &vfs.BoundEndpointOptions{path})
- root.DecRef()
+ root.DecRef(t)
if relPath {
- start.DecRef()
+ start.DecRef(t)
}
if e != nil {
return nil, syserr.FromError(e)
@@ -392,15 +393,15 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint,
cwd := t.FSContext().WorkingDirectory()
remainingTraversals := uint(fs.DefaultTraversalLimit)
d, e := t.MountNamespace().FindInode(t, root, cwd, path, &remainingTraversals)
- cwd.DecRef()
- root.DecRef()
+ cwd.DecRef(t)
+ root.DecRef(t)
if e != nil {
return nil, syserr.FromError(e)
}
// Extract the endpoint if one is there.
ep := d.Inode.BoundEndpoint(path)
- d.DecRef()
+ d.DecRef(t)
if ep == nil {
// No socket!
return nil, syserr.ErrConnectionRefused
@@ -414,10 +415,21 @@ func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool
if err != nil {
return err
}
- defer ep.Release()
+ defer ep.Release(t)
// Connect the server endpoint.
- return s.ep.Connect(t, ep)
+ err = s.ep.Connect(t, ep)
+
+ if err == syserr.ErrWrongProtocolForSocket {
+ // For abstract sockets, Linux returns ErrConnectionRefused
+ // instead of ErrWrongProtocolForSocket.
+ path, _ := extractPath(sockaddr)
+ if len(path) > 0 && path[0] == 0 {
+ err = syserr.ErrConnectionRefused
+ }
+ }
+
+ return err
}
// Write implements fs.FileOperations.Write.
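The Connect fix keys off Linux's abstract-namespace convention: a sun_path beginning with a NUL byte. Only for those does the protocol-mismatch error get rewritten to ECONNREFUSED. A tiny sketch of the check:

package main

import "fmt"

// isAbstract reports whether a sockaddr_un path names an abstract-
// namespace socket; Linux marks these with a leading NUL byte, which is
// exactly the path[0] == 0 check the Connect fix performs.
func isAbstract(path string) bool {
	return len(path) > 0 && path[0] == 0
}

func main() {
	fmt.Println(isAbstract("\x00gvisor-test")) // true: abstract namespace
	fmt.Println(isAbstract("/tmp/sock"))       // false: filesystem path
}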
@@ -448,15 +460,25 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b
To: nil,
}
if len(to) > 0 {
- ep, err := extractEndpoint(t, to)
- if err != nil {
- return 0, err
- }
- defer ep.Release()
- w.To = ep
+ switch s.stype {
+ case linux.SOCK_SEQPACKET:
+ to = nil
+ case linux.SOCK_STREAM:
+ if s.State() == linux.SS_CONNECTED {
+ return 0, syserr.ErrAlreadyConnected
+ }
+ return 0, syserr.ErrNotSupported
+ default:
+ ep, err := extractEndpoint(t, to)
+ if err != nil {
+ return 0, err
+ }
+ defer ep.Release(t)
+ w.To = ep
- if ep.Passcred() && w.Control.Credentials == nil {
- w.Control.Credentials = control.MakeCreds(t)
+ if ep.Passcred() && w.Control.Credentials == nil {
+ w.Control.Credentials = control.MakeCreds(t)
+ }
}
}
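The new SendMsg switch matches Linux's handling of a destination address on AF_UNIX sockets: SOCK_SEQPACKET ignores it, a connected SOCK_STREAM fails with EISCONN, an unconnected one with EOPNOTSUPP, and SOCK_DGRAM resolves and uses it. A behavioral sketch:

package main

import (
	"errors"
	"fmt"
)

var (
	errIsConn       = errors.New("EISCONN")
	errNotSupported = errors.New("EOPNOTSUPP")
)

// checkSendTo mirrors the destination-address switch added to SendMsg:
// what happens when msg_name is supplied on each AF_UNIX socket type.
func checkSendTo(stype string, connected bool) error {
	switch stype {
	case "SOCK_SEQPACKET":
		return nil // the address is ignored
	case "SOCK_STREAM":
		if connected {
			return errIsConn
		}
		return errNotSupported
	default: // SOCK_DGRAM: the address is resolved and used
		return nil
	}
}

func main() {
	fmt.Println(checkSendTo("SOCK_STREAM", true))     // EISCONN
	fmt.Println(checkSendTo("SOCK_SEQPACKET", false)) // <nil>
}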
diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go
index 45e109361..dfa25241a 100644
--- a/pkg/sentry/socket/unix/unix_vfs2.go
+++ b/pkg/sentry/socket/unix/unix_vfs2.go
@@ -19,6 +19,7 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/socket"
@@ -31,6 +32,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
+ "gvisor.dev/gvisor/tools/go_marshal/marshal"
)
// SocketVFS2 implements socket.SocketVFS2 (and by extension,
@@ -39,6 +41,7 @@ type SocketVFS2 struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
+ vfs.LockFD
socketOpsCommon
}
@@ -51,7 +54,7 @@ func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType)
mnt := t.Kernel().SocketMount()
d := sockfs.NewDentry(t.Credentials(), mnt)
- fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d)
+ fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d, &vfs.FileLocks{})
if err != nil {
return nil, syserr.FromError(err)
}
@@ -60,7 +63,7 @@ func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType)
// NewFileDescription creates and returns a socket file description
// corresponding to the given mount and dentry.
-func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint32, mnt *vfs.Mount, d *vfs.Dentry) (*vfs.FileDescription, error) {
+func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint32, mnt *vfs.Mount, d *vfs.Dentry, locks *vfs.FileLocks) (*vfs.FileDescription, error) {
// You can create AF_UNIX, SOCK_RAW sockets. They're the same as
// SOCK_DGRAM and don't require CAP_NET_RAW.
if stype == linux.SOCK_RAW {
@@ -73,6 +76,7 @@ func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint3
stype: stype,
},
}
+ sock.LockFD.Init(locks)
vfsfd := &sock.vfsfd
if err := vfsfd.Init(sock, flags, mnt, d, &vfs.FileDescriptionOptions{
DenyPRead: true,
@@ -86,7 +90,7 @@ func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint3
// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
// a transport.Endpoint.
-func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
+func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen)
}
@@ -132,7 +136,7 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block
if err != nil {
return 0, nil, 0, err
}
- defer ns.DecRef()
+ defer ns.DecRef(t)
if flags&linux.SOCK_NONBLOCK != 0 {
ns.SetStatusFlags(t, t.Credentials(), linux.SOCK_NONBLOCK)
@@ -179,19 +183,19 @@ func (s *SocketVFS2) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
if t.IsNetworkNamespaced() {
return syserr.ErrInvalidEndpointState
}
- if err := t.AbstractSockets().Bind(p[1:], bep, s); err != nil {
+ if err := t.AbstractSockets().Bind(t, p[1:], bep, s); err != nil {
// syserr.ErrPortInUse corresponds to EADDRINUSE.
return syserr.ErrPortInUse
}
} else {
path := fspath.Parse(p)
root := t.FSContext().RootDirectoryVFS2()
- defer root.DecRef()
+ defer root.DecRef(t)
start := root
relPath := !path.Absolute
if relPath {
start = t.FSContext().WorkingDirectoryVFS2()
- defer start.DecRef()
+ defer start.DecRef(t)
}
pop := vfs.PathOperation{
Root: root,
@@ -297,6 +301,16 @@ func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []by
return netstack.SetSockOpt(t, s, s.ep, level, name, optVal)
}
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence)
+}
+
// providerVFS2 is a unix domain socket provider for VFS2.
type providerVFS2 struct{}
@@ -319,7 +333,7 @@ func (*providerVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int)
f, err := NewSockfsFile(t, ep, stype)
if err != nil {
- ep.Close()
+ ep.Close(t)
return nil, err
}
return f, nil
@@ -343,14 +357,14 @@ func (*providerVFS2) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*
ep1, ep2 := transport.NewPair(t, stype, t.Kernel())
s1, err := NewSockfsFile(t, ep1, stype)
if err != nil {
- ep1.Close()
- ep2.Close()
+ ep1.Close(t)
+ ep2.Close(t)
return nil, nil, err
}
s2, err := NewSockfsFile(t, ep2, stype)
if err != nil {
- s1.DecRef()
- ep2.Close()
+ s1.DecRef(t)
+ ep2.Close(t)
return nil, nil, err
}
diff --git a/pkg/sentry/state/state.go b/pkg/sentry/state/state.go
index 9eb626b76..a06c9b8ab 100644
--- a/pkg/sentry/state/state.go
+++ b/pkg/sentry/state/state.go
@@ -60,6 +60,7 @@ type SaveOpts struct {
func (opts SaveOpts) Save(k *kernel.Kernel, w *watchdog.Watchdog) error {
log.Infof("Sandbox save started, pausing all tasks.")
k.Pause()
+ k.ReceiveTaskStates()
defer k.Unpause()
defer log.Infof("Tasks resumed after save.")
diff --git a/pkg/sentry/strace/epoll.go b/pkg/sentry/strace/epoll.go
index a6e48b836..5d51a7792 100644
--- a/pkg/sentry/strace/epoll.go
+++ b/pkg/sentry/strace/epoll.go
@@ -50,10 +50,10 @@ func epollEvents(t *kernel.Task, eventsAddr usermem.Addr, numEvents, maxBytes ui
sb.WriteString("...")
break
}
- if _, ok := addr.AddLength(uint64(linux.SizeOfEpollEvent)); !ok {
- fmt.Fprintf(&sb, "{error reading event at %#x: EFAULT}", addr)
- continue
- }
+ // Allowing addr to overflow is consistent with Linux, and harmless; if
+ // this isn't the last iteration of the loop, the next call to CopyIn
+ // will just fail with EFAULT.
+ addr, _ = addr.AddLength(uint64(linux.SizeOfEpollEvent))
}
sb.WriteString("}")
return sb.String()
@@ -75,7 +75,7 @@ var epollEventEvents = abi.FlagSet{
{Flag: linux.EPOLLPRI, Name: "EPOLLPRI"},
{Flag: linux.EPOLLOUT, Name: "EPOLLOUT"},
{Flag: linux.EPOLLERR, Name: "EPOLLERR"},
- {Flag: linux.EPOLLHUP, Name: "EPULLHUP"},
+ {Flag: linux.EPOLLHUP, Name: "EPOLLHUP"},
{Flag: linux.EPOLLRDNORM, Name: "EPOLLRDNORM"},
{Flag: linux.EPOLLRDBAND, Name: "EPOLLRDBAND"},
{Flag: linux.EPOLLWRNORM, Name: "EPOLLWRNORM"},
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index c0512de89..b51c4c941 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -521,6 +521,7 @@ var sockOptNames = map[uint64]abi.ValueSet{
linux.IP_ROUTER_ALERT: "IP_ROUTER_ALERT",
linux.IP_PKTOPTIONS: "IP_PKTOPTIONS",
linux.IP_MTU: "IP_MTU",
+ linux.SO_ORIGINAL_DST: "SO_ORIGINAL_DST",
},
linux.SOL_SOCKET: {
linux.SO_ERROR: "SO_ERROR",
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index 68ca537c8..87b239730 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -147,14 +147,14 @@ func fd(t *kernel.Task, fd int32) string {
root := t.FSContext().RootDirectory()
if root != nil {
- defer root.DecRef()
+ defer root.DecRef(t)
}
if fd == linux.AT_FDCWD {
wd := t.FSContext().WorkingDirectory()
var name string
if wd != nil {
- defer wd.DecRef()
+ defer wd.DecRef(t)
name, _ = wd.FullName(root)
} else {
name = "(unknown cwd)"
@@ -167,7 +167,7 @@ func fd(t *kernel.Task, fd int32) string {
// Cast FD to uint64 to avoid printing negative hex.
return fmt.Sprintf("%#x (bad FD)", uint64(fd))
}
- defer file.DecRef()
+ defer file.DecRef(t)
name, _ := file.Dirent.FullName(root)
return fmt.Sprintf("%#x %s", fd, name)
@@ -175,12 +175,12 @@ func fd(t *kernel.Task, fd int32) string {
func fdVFS2(t *kernel.Task, fd int32) string {
root := t.FSContext().RootDirectoryVFS2()
- defer root.DecRef()
+ defer root.DecRef(t)
vfsObj := root.Mount().Filesystem().VirtualFilesystem()
if fd == linux.AT_FDCWD {
wd := t.FSContext().WorkingDirectoryVFS2()
- defer wd.DecRef()
+ defer wd.DecRef(t)
name, _ := vfsObj.PathnameWithDeleted(t, root, wd)
return fmt.Sprintf("AT_FDCWD %s", name)
@@ -191,7 +191,7 @@ func fdVFS2(t *kernel.Task, fd int32) string {
// Cast FD to uint64 to avoid printing negative hex.
return fmt.Sprintf("%#x (bad FD)", uint64(fd))
}
- defer file.DecRef()
+ defer file.DecRef(t)
name, _ := vfsObj.PathnameWithDeleted(t, root, file.VirtualDentry())
return fmt.Sprintf("%#x %s", fd, name)
diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go
index d9fb808c0..d23a0068a 100644
--- a/pkg/sentry/syscalls/epoll.go
+++ b/pkg/sentry/syscalls/epoll.go
@@ -28,7 +28,7 @@ import (
// CreateEpoll implements the epoll_create(2) linux syscall.
func CreateEpoll(t *kernel.Task, closeOnExec bool) (int32, error) {
file := epoll.NewEventPoll(t)
- defer file.DecRef()
+ defer file.DecRef(t)
fd, err := t.NewFDFrom(0, file, kernel.FDFlags{
CloseOnExec: closeOnExec,
@@ -47,14 +47,14 @@ func AddEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask
if epollfile == nil {
return syserror.EBADF
}
- defer epollfile.DecRef()
+ defer epollfile.DecRef(t)
// Get the target file id.
file := t.GetFile(fd)
if file == nil {
return syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the epollPoll operations.
e, ok := epollfile.FileOperations.(*epoll.EventPoll)
@@ -73,14 +73,14 @@ func UpdateEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, m
if epollfile == nil {
return syserror.EBADF
}
- defer epollfile.DecRef()
+ defer epollfile.DecRef(t)
// Get the target file id.
file := t.GetFile(fd)
if file == nil {
return syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the epollPoll operations.
e, ok := epollfile.FileOperations.(*epoll.EventPoll)
@@ -99,14 +99,14 @@ func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error {
if epollfile == nil {
return syserror.EBADF
}
- defer epollfile.DecRef()
+ defer epollfile.DecRef(t)
// Get the target file id.
file := t.GetFile(fd)
if file == nil {
return syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the epollPoll operations.
e, ok := epollfile.FileOperations.(*epoll.EventPoll)
@@ -115,7 +115,7 @@ func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error {
}
// Try to remove the entry.
- return e.RemoveEntry(epoll.FileIdentifier{file, fd})
+ return e.RemoveEntry(t, epoll.FileIdentifier{file, fd})
}
// WaitEpoll implements the epoll_wait(2) linux syscall.
@@ -125,7 +125,7 @@ func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]linux.EpollEve
if epollfile == nil {
return nil, syserror.EBADF
}
- defer epollfile.DecRef()
+ defer epollfile.DecRef(t)
// Extract the epollPoll operations.
e, ok := epollfile.FileOperations.(*epoll.EventPoll)
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index 217fcfef2..4a9b04fd0 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -99,5 +99,7 @@ go_library(
"//pkg/syserror",
"//pkg/usermem",
"//pkg/waiter",
+ "//tools/go_marshal/marshal",
+ "//tools/go_marshal/primitive",
],
)
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index ea4f9b1a7..80c65164a 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -325,8 +325,8 @@ var AMD64 = &kernel.SyscallTable{
270: syscalls.Supported("pselect", Pselect),
271: syscalls.Supported("ppoll", Ppoll),
272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil),
- 273: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil),
- 274: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil),
+ 273: syscalls.Supported("set_robust_list", SetRobustList),
+ 274: syscalls.Supported("get_robust_list", GetRobustList),
275: syscalls.Supported("splice", Splice),
276: syscalls.Supported("tee", Tee),
277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil),
diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go
index d781d6a04..e9d64dec5 100644
--- a/pkg/sentry/syscalls/linux/sys_aio.go
+++ b/pkg/sentry/syscalls/linux/sys_aio.go
@@ -15,8 +15,8 @@
package linux
import (
- "encoding/binary"
-
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -27,59 +27,6 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
-// I/O commands.
-const (
- _IOCB_CMD_PREAD = 0
- _IOCB_CMD_PWRITE = 1
- _IOCB_CMD_FSYNC = 2
- _IOCB_CMD_FDSYNC = 3
- _IOCB_CMD_NOOP = 6
- _IOCB_CMD_PREADV = 7
- _IOCB_CMD_PWRITEV = 8
-)
-
-// I/O flags.
-const (
- _IOCB_FLAG_RESFD = 1
-)
-
-// ioCallback describes an I/O request.
-//
-// The priority field is currently ignored in the implementation below. Also
-// note that the IOCB_FLAG_RESFD feature is not supported.
-type ioCallback struct {
- Data uint64
- Key uint32
- Reserved1 uint32
-
- OpCode uint16
- ReqPrio int16
- FD int32
-
- Buf uint64
- Bytes uint64
- Offset int64
-
- Reserved2 uint64
- Flags uint32
-
- // eventfd to signal if IOCB_FLAG_RESFD is set in flags.
- ResFD int32
-}
-
-// ioEvent describes an I/O result.
-//
-// +stateify savable
-type ioEvent struct {
- Data uint64
- Obj uint64
- Result int64
- Result2 int64
-}
-
-// ioEventSize is the size of an ioEvent encoded.
-var ioEventSize = binary.Size(ioEvent{})
-
// IoSetup implements linux syscall io_setup(2).
func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
nrEvents := args[0].Int()
@@ -192,7 +139,7 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
}
}
- ev := v.(*ioEvent)
+ ev := v.(*linux.IOEvent)
// Copy out the result.
if _, err := t.CopyOut(eventsAddr, ev); err != nil {
@@ -204,7 +151,7 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
}
// Keep rolling.
- eventsAddr += usermem.Addr(ioEventSize)
+ eventsAddr += usermem.Addr(linux.IOEventSize)
}
// Everything finished.
@@ -231,7 +178,7 @@ func waitForRequest(ctx *mm.AIOContext, t *kernel.Task, haveDeadline bool, deadl
}
// memoryFor returns appropriate memory for the given callback.
-func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) {
+func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) {
bytes := int(cb.Bytes)
if bytes < 0 {
// Linux also requires that this field fit in ssize_t.
@@ -242,17 +189,17 @@ func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) {
// we have no guarantee that t's AddressSpace will be active during the
// I/O.
switch cb.OpCode {
- case _IOCB_CMD_PREAD, _IOCB_CMD_PWRITE:
+ case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE:
return t.SingleIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{
AddressSpaceActive: false,
})
- case _IOCB_CMD_PREADV, _IOCB_CMD_PWRITEV:
+ case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV:
return t.IovecsIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{
AddressSpaceActive: false,
})
- case _IOCB_CMD_FSYNC, _IOCB_CMD_FDSYNC, _IOCB_CMD_NOOP:
+ case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP:
return usermem.IOSequence{}, nil
default:
@@ -261,70 +208,78 @@ func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) {
}
}
-func performCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *ioCallback, ioseq usermem.IOSequence, ctx *mm.AIOContext, eventFile *fs.File) {
- if ctx.Dead() {
- ctx.CancelPendingRequest()
- return
- }
- ev := &ioEvent{
- Data: cb.Data,
- Obj: uint64(cbAddr),
- }
+// IoCancel implements linux syscall io_cancel(2).
+//
+// It is not presently supported (ENOSYS indicates no support on this
+// architecture).
+func IoCancel(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return 0, nil, syserror.ENOSYS
+}
- // Construct a context.Context that will not be interrupted if t is
- // interrupted.
- c := t.AsyncContext()
+// LINT.IfChange
- var err error
- switch cb.OpCode {
- case _IOCB_CMD_PREAD, _IOCB_CMD_PREADV:
- ev.Result, err = file.Preadv(c, ioseq, cb.Offset)
- case _IOCB_CMD_PWRITE, _IOCB_CMD_PWRITEV:
- ev.Result, err = file.Pwritev(c, ioseq, cb.Offset)
- case _IOCB_CMD_FSYNC:
- err = file.Fsync(c, 0, fs.FileMaxOffset, fs.SyncAll)
- case _IOCB_CMD_FDSYNC:
- err = file.Fsync(c, 0, fs.FileMaxOffset, fs.SyncData)
- }
+func getAIOCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, actx *mm.AIOContext, eventFile *fs.File) kernel.AIOCallback {
+ return func(ctx context.Context) {
+ if actx.Dead() {
+ actx.CancelPendingRequest()
+ return
+ }
+ ev := &linux.IOEvent{
+ Data: cb.Data,
+ Obj: uint64(cbAddr),
+ }
- // Update the result.
- if err != nil {
- err = handleIOError(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", file)
- ev.Result = -int64(kernel.ExtractErrno(err, 0))
- }
+ var err error
+ switch cb.OpCode {
+ case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV:
+ ev.Result, err = file.Preadv(ctx, ioseq, cb.Offset)
+ case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV:
+ ev.Result, err = file.Pwritev(ctx, ioseq, cb.Offset)
+ case linux.IOCB_CMD_FSYNC:
+ err = file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll)
+ case linux.IOCB_CMD_FDSYNC:
+ err = file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncData)
+ }
+
+ // Update the result.
+ if err != nil {
+ err = handleIOError(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", file)
+ ev.Result = -int64(kernel.ExtractErrno(err, 0))
+ }
- file.DecRef()
+ file.DecRef(ctx)
- // Queue the result for delivery.
- ctx.FinishRequest(ev)
+ // Queue the result for delivery.
+ actx.FinishRequest(ev)
- // Notify the event file if one was specified. This needs to happen
- // *after* queueing the result to avoid racing with the thread we may
- // wake up.
- if eventFile != nil {
- eventFile.FileOperations.(*eventfd.EventOperations).Signal(1)
- eventFile.DecRef()
+ // Notify the event file if one was specified. This needs to happen
+ // *after* queueing the result to avoid racing with the thread we may
+ // wake up.
+ if eventFile != nil {
+ eventFile.FileOperations.(*eventfd.EventOperations).Signal(1)
+ eventFile.DecRef(ctx)
+ }
}
}
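The restructuring above swaps a detached fs.Async goroutine for a callback handed to t.QueueAIO, letting the kernel account for in-flight AIO (for instance across the save path touched earlier in this diff, where ReceiveTaskStates needs quiescence). The shape of such a queue, sketched with stand-in types rather than the sentry's kernel package:

package sketch

import (
	"context"
	"sync"
)

// AIOCallback matches the shape used above: work that runs under a
// context outliving the submitting syscall.
type AIOCallback func(ctx context.Context)

type aioQueue struct{ wg sync.WaitGroup }

func (q *aioQueue) QueueAIO(cb AIOCallback) {
	q.wg.Add(1)
	go func() {
		defer q.wg.Done()
		cb(context.Background())
	}()
}

// drain waits for all queued callbacks; tracking completions like
// this is what makes pausing with AIO outstanding safe.
func (q *aioQueue) drain() { q.wg.Wait() }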
// submitCallback processes a single callback.
-func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Addr) error {
+func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr usermem.Addr) error {
file := t.GetFile(cb.FD)
if file == nil {
// File not found.
return syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Was there an eventFD? Extract it.
var eventFile *fs.File
- if cb.Flags&_IOCB_FLAG_RESFD != 0 {
+ if cb.Flags&linux.IOCB_FLAG_RESFD != 0 {
eventFile = t.GetFile(cb.ResFD)
if eventFile == nil {
// Bad FD.
return syserror.EBADF
}
- defer eventFile.DecRef()
+ defer eventFile.DecRef(t)
// Check that it is an eventfd.
if _, ok := eventFile.FileOperations.(*eventfd.EventOperations); !ok {
@@ -340,7 +295,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad
// Check offset for reads/writes.
switch cb.OpCode {
- case _IOCB_CMD_PREAD, _IOCB_CMD_PREADV, _IOCB_CMD_PWRITE, _IOCB_CMD_PWRITEV:
+ case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV:
if cb.Offset < 0 {
return syserror.EINVAL
}
@@ -366,7 +321,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad
// Perform the request asynchronously.
file.IncRef()
- fs.Async(func() { performCallback(t, file, cbAddr, cb, ioseq, ctx, eventFile) })
+ t.QueueAIO(getAIOCallback(t, file, cbAddr, cb, ioseq, ctx, eventFile))
// All set.
return nil
@@ -395,7 +350,7 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
}
// Copy in this callback.
- var cb ioCallback
+ var cb linux.IOCallback
cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative))
if _, err := t.CopyIn(cbAddr, &cb); err != nil {
@@ -424,10 +379,4 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
return uintptr(nrEvents), nil, nil
}
-// IoCancel implements linux syscall io_cancel(2).
-//
-// It is not presently supported (ENOSYS indicates no support on this
-// architecture).
-func IoCancel(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- return 0, nil, syserror.ENOSYS
-}
+// LINT.ThenChange(vfs2/aio.go)
diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go
index ed3413ca6..3b4f879e4 100644
--- a/pkg/sentry/syscalls/linux/sys_eventfd.go
+++ b/pkg/sentry/syscalls/linux/sys_eventfd.go
@@ -37,7 +37,7 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
event.SetFlags(fs.SettableFileFlags{
NonBlocking: flags&linux.EFD_NONBLOCK != 0,
})
- defer event.DecRef()
+ defer event.DecRef(t)
fd, err := t.NewFDFrom(0, event, kernel.FDFlags{
CloseOnExec: flags&linux.EFD_CLOEXEC != 0,
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index 35a98212a..1bc9b184e 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -40,7 +40,7 @@ func fileOpAt(t *kernel.Task, dirFD int32, path string, fn func(root *fs.Dirent,
// Common case: we are accessing a file in the root.
root := t.FSContext().RootDirectory()
err := fn(root, root, name, linux.MaxSymlinkTraversals)
- root.DecRef()
+ root.DecRef(t)
return err
} else if dir == "." && dirFD == linux.AT_FDCWD {
// Common case: we are accessing a file relative to the current
@@ -48,8 +48,8 @@ func fileOpAt(t *kernel.Task, dirFD int32, path string, fn func(root *fs.Dirent,
wd := t.FSContext().WorkingDirectory()
root := t.FSContext().RootDirectory()
err := fn(root, wd, name, linux.MaxSymlinkTraversals)
- wd.DecRef()
- root.DecRef()
+ wd.DecRef(t)
+ root.DecRef(t)
return err
}
@@ -97,19 +97,19 @@ func fileOpOn(t *kernel.Task, dirFD int32, path string, resolve bool, fn func(ro
} else {
d, err = t.MountNamespace().FindLink(t, root, rel, path, &remainingTraversals)
}
- root.DecRef()
+ root.DecRef(t)
if wd != nil {
- wd.DecRef()
+ wd.DecRef(t)
}
if f != nil {
- f.DecRef()
+ f.DecRef(t)
}
if err != nil {
return err
}
err = fn(root, d, remainingTraversals)
- d.DecRef()
+ d.DecRef(t)
return err
}
@@ -186,7 +186,7 @@ func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uint
if err != nil {
return syserror.ConvertIntr(err, kernel.ERESTARTSYS)
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Success.
newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{
@@ -242,7 +242,7 @@ func mknodAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode
if err != nil {
return err
}
- file.DecRef()
+ file.DecRef(t)
return nil
case linux.ModeNamedPipe:
@@ -332,7 +332,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l
if err != nil {
break
}
- defer found.DecRef()
+ defer found.DecRef(t)
// We found something (possibly a symlink). If the
// O_EXCL flag was passed, then we can immediately
@@ -357,7 +357,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l
resolved, err = found.Inode.Getlink(t)
if err == nil {
// No more resolution necessary.
- defer resolved.DecRef()
+ defer resolved.DecRef(t)
break
}
if err != fs.ErrResolveViaReadlink {
@@ -384,7 +384,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l
if err != nil {
break
}
- defer newParent.DecRef()
+ defer newParent.DecRef(t)
// Repeat the process with the parent and name of the
// symlink target.
@@ -416,7 +416,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l
if err != nil {
return syserror.ConvertIntr(err, kernel.ERESTARTSYS)
}
- defer newFile.DecRef()
+ defer newFile.DecRef(t)
case syserror.ENOENT:
// File does not exist. Proceed with creation.
@@ -432,7 +432,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l
// No luck, bail.
return err
}
- defer newFile.DecRef()
+ defer newFile.DecRef(t)
found = newFile.Dirent
default:
return err
@@ -596,7 +596,7 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Shared flags between file and socket.
switch request {
@@ -671,9 +671,9 @@ func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
addr := args[0].Pointer()
size := args[1].SizeT()
cwd := t.FSContext().WorkingDirectory()
- defer cwd.DecRef()
+ defer cwd.DecRef(t)
root := t.FSContext().RootDirectory()
- defer root.DecRef()
+ defer root.DecRef(t)
// Get our fullname from the root and prepend unreachable if the root was
// unreachable from our current dirent; this is the same behavior as on Linux.
@@ -722,7 +722,7 @@ func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return err
}
- t.FSContext().SetRootDirectory(d)
+ t.FSContext().SetRootDirectory(t, d)
return nil
})
}
@@ -747,7 +747,7 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return err
}
- t.FSContext().SetWorkingDirectory(d)
+ t.FSContext().SetWorkingDirectory(t, d)
return nil
})
}
@@ -760,7 +760,7 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Is it a directory?
if !fs.IsDir(file.Dirent.Inode.StableAttr) {
@@ -772,7 +772,7 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return 0, nil, err
}
- t.FSContext().SetWorkingDirectory(file.Dirent)
+ t.FSContext().SetWorkingDirectory(t, file.Dirent)
return 0, nil, nil
}
@@ -791,7 +791,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
err := file.Flush(t)
return 0, nil, handleIOError(t, false /* partial */, err, syserror.EINTR, "close", file)
@@ -805,7 +805,7 @@ func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{})
if err != nil {
@@ -826,7 +826,7 @@ func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if oldFile == nil {
return 0, nil, syserror.EBADF
}
- defer oldFile.DecRef()
+ defer oldFile.DecRef(t)
return uintptr(newfd), nil, nil
}
@@ -850,7 +850,7 @@ func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if oldFile == nil {
return 0, nil, syserror.EBADF
}
- defer oldFile.DecRef()
+ defer oldFile.DecRef(t)
err := t.NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0})
if err != nil {
@@ -900,14 +900,20 @@ func fGetOwn(t *kernel.Task, file *fs.File) int32 {
//
// If who is positive, it represents a PID. If negative, it represents a PGID.
// If the PID or PGID is invalid, the owner is silently unset.
-func fSetOwn(t *kernel.Task, file *fs.File, who int32) {
+func fSetOwn(t *kernel.Task, file *fs.File, who int32) error {
a := file.Async(fasync.New).(*fasync.FileAsync)
if who < 0 {
+ // Check for overflow before flipping the sign.
+ if who-1 > who {
+ return syserror.EINVAL
+ }
pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(-who))
a.SetOwnerProcessGroup(t, pg)
+ } else {
+ tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(who))
+ a.SetOwnerThreadGroup(t, tg)
}
- tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(who))
- a.SetOwnerThreadGroup(t, tg)
+ return nil
}
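The overflow guard reads oddly but is exact: the only int32 for which subtracting one does not produce a smaller value is math.MinInt32, precisely the value whose negation cannot be represented. A short demonstration:

package main

import (
	"fmt"
	"math"
)

func main() {
	who := int32(math.MinInt32)
	fmt.Println(who-1 > who)             // true: who-1 wraps to MaxInt32
	fmt.Println(-who)                    // -2147483648: negation overflowed
	fmt.Println(int32(-5)-1 > int32(-5)) // false: ordinary negatives pass
}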
// Fcntl implements linux syscall fcntl(2).
@@ -919,7 +925,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
switch cmd {
case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC:
@@ -935,10 +941,10 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return uintptr(flags.ToLinuxFDFlags()), nil, nil
case linux.F_SETFD:
flags := args[2].Uint()
- t.FDTable().SetFlags(fd, kernel.FDFlags{
+ err := t.FDTable().SetFlags(fd, kernel.FDFlags{
CloseOnExec: flags&linux.FD_CLOEXEC != 0,
})
- return 0, nil, nil
+ return 0, nil, err
case linux.F_GETFL:
return uintptr(file.Flags().ToLinux()), nil, nil
case linux.F_SETFL:
@@ -998,9 +1004,6 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return 0, nil, err
}
- // The lock uid is that of the Task's FDTable.
- lockUniqueID := lock.UniqueID(t.FDTable().ID())
-
// These locks don't block; execute the non-blocking operation using the inode's lock
// context directly.
switch flock.Type {
@@ -1010,12 +1013,12 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
if cmd == linux.F_SETLK {
// Non-blocking lock, provide a nil lock.Blocker.
- if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) {
+ if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, nil) {
return 0, nil, syserror.EAGAIN
}
} else {
// Blocking lock, pass in the task to satisfy the lock.Blocker interface.
- if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, t) {
+ if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, t) {
return 0, nil, syserror.EINTR
}
}
@@ -1026,18 +1029,18 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
if cmd == linux.F_SETLK {
// Non-blocking lock, provide a nil lock.Blocker.
- if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) {
+ if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, nil) {
return 0, nil, syserror.EAGAIN
}
} else {
// Blocking lock, pass in the task to satisfy the lock.Blocker interface.
- if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, t) {
+ if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, t) {
return 0, nil, syserror.EINTR
}
}
return 0, nil, nil
case linux.F_UNLCK:
- file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lockUniqueID, rng)
+ file.Dirent.Inode.LockCtx.Posix.UnlockRegion(t.FDTable(), rng)
return 0, nil, nil
default:
return 0, nil, syserror.EINVAL
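In the hunks above, F_SETLK versus F_SETLKW is expressed entirely through the final argument: a nil Blocker fails fast with EAGAIN, while passing the task blocks until the region frees or the wait is interrupted (EINTR). A sketch of an API shaped that way, interface invented for illustration:

package sketch

// Blocker is the shape implied by the calls above: something that
// can wait on a wakeup channel and report interruption.
type Blocker interface {
	Block(ready <-chan struct{}) error
}

// lockRegion retries tryAcquire, failing fast when b is nil
// (F_SETLK) and giving up only on interruption otherwise (F_SETLKW).
func lockRegion(tryAcquire func() bool, b Blocker, ready <-chan struct{}) bool {
	for !tryAcquire() {
		if b == nil {
			return false // caller maps this to EAGAIN
		}
		if err := b.Block(ready); err != nil {
			return false // caller maps this to EINTR
		}
	}
	return true
}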
@@ -1045,8 +1048,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
case linux.F_GETOWN:
return uintptr(fGetOwn(t, file)), nil, nil
case linux.F_SETOWN:
- fSetOwn(t, file, args[2].Int())
- return 0, nil, nil
+ return 0, nil, fSetOwn(t, file, args[2].Int())
case linux.F_GETOWN_EX:
addr := args[2].Pointer()
owner := fGetOwnEx(t, file)
@@ -1055,7 +1057,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
case linux.F_SETOWN_EX:
addr := args[2].Pointer()
var owner linux.FOwnerEx
- n, err := t.CopyIn(addr, &owner)
+ _, err := t.CopyIn(addr, &owner)
if err != nil {
return 0, nil, err
}
@@ -1067,21 +1069,21 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return 0, nil, syserror.ESRCH
}
a.SetOwnerTask(t, task)
- return uintptr(n), nil, nil
+ return 0, nil, nil
case linux.F_OWNER_PID:
tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(owner.PID))
if tg == nil {
return 0, nil, syserror.ESRCH
}
a.SetOwnerThreadGroup(t, tg)
- return uintptr(n), nil, nil
+ return 0, nil, nil
case linux.F_OWNER_PGRP:
pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(owner.PID))
if pg == nil {
return 0, nil, syserror.ESRCH
}
a.SetOwnerProcessGroup(t, pg)
- return uintptr(n), nil, nil
+ return 0, nil, nil
default:
return 0, nil, syserror.EINVAL
}
@@ -1114,17 +1116,6 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
}
-// LINT.ThenChange(vfs2/fd.go)
-
-const (
- _FADV_NORMAL = 0
- _FADV_RANDOM = 1
- _FADV_SEQUENTIAL = 2
- _FADV_WILLNEED = 3
- _FADV_DONTNEED = 4
- _FADV_NOREUSE = 5
-)
-
// Fadvise64 implements linux syscall fadvise64(2).
// This implementation currently ignores the provided advice.
func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
@@ -1141,7 +1132,7 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// If the FD refers to a pipe or FIFO, return error.
if fs.IsPipe(file.Dirent.Inode.StableAttr) {
@@ -1149,12 +1140,12 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
}
switch advice {
- case _FADV_NORMAL:
- case _FADV_RANDOM:
- case _FADV_SEQUENTIAL:
- case _FADV_WILLNEED:
- case _FADV_DONTNEED:
- case _FADV_NOREUSE:
+ case linux.POSIX_FADV_NORMAL:
+ case linux.POSIX_FADV_RANDOM:
+ case linux.POSIX_FADV_SEQUENTIAL:
+ case linux.POSIX_FADV_WILLNEED:
+ case linux.POSIX_FADV_DONTNEED:
+ case linux.POSIX_FADV_NOREUSE:
default:
return 0, nil, syserror.EINVAL
}
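For reference, the named constants carry the same values the deleted private ones did; these are from the fadvise(2) UAPI on x86-64:

package sketch

// Values from include/uapi/linux/fadvise.h (x86-64).
const (
	POSIX_FADV_NORMAL     = 0
	POSIX_FADV_RANDOM     = 1
	POSIX_FADV_SEQUENTIAL = 2
	POSIX_FADV_WILLNEED   = 3
	POSIX_FADV_DONTNEED   = 4
	POSIX_FADV_NOREUSE    = 5
)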
@@ -1163,8 +1154,6 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, nil
}
-// LINT.IfChange
-
func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error {
path, _, err := copyInPath(t, addr, false /* allowEmpty */)
if err != nil {
@@ -1182,7 +1171,7 @@ func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode
switch err {
case nil:
// The directory existed.
- defer f.DecRef()
+ defer f.DecRef(t)
return syserror.EEXIST
case syserror.EACCES:
// Permission denied while walking to the directory.
@@ -1360,7 +1349,7 @@ func linkAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32
if target == nil {
return syserror.EBADF
}
- defer target.DecRef()
+ defer target.DecRef(t)
if err := mayLinkAt(t, target.Dirent.Inode); err != nil {
return err
}
@@ -1613,7 +1602,7 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Reject truncation if the file flags do not permit this operation.
// This is different from truncate(2) above.
@@ -1741,7 +1730,7 @@ func chownAt(t *kernel.Task, fd int32, addr usermem.Addr, resolve, allowEmpty bo
if file == nil {
return syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
return chown(t, file.Dirent, uid, gid)
}
@@ -1779,7 +1768,7 @@ func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
return 0, nil, chown(t, file.Dirent, uid, gid)
}
@@ -1844,7 +1833,7 @@ func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
return 0, nil, chmod(t, file.Dirent, mode)
}
@@ -1904,10 +1893,10 @@ func utimes(t *kernel.Task, dirFD int32, addr usermem.Addr, ts fs.TimeSpec, reso
if f == nil {
return syserror.EBADF
}
- defer f.DecRef()
+ defer f.DecRef(t)
root := t.FSContext().RootDirectory()
- defer root.DecRef()
+ defer root.DecRef(t)
return setTimestamp(root, f.Dirent, linux.MaxSymlinkTraversals)
}
@@ -2099,7 +2088,7 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
if offset < 0 || length <= 0 {
return 0, nil, syserror.EINVAL
@@ -2152,27 +2141,11 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// flock(2): EBADF fd is not an open file descriptor.
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
nonblocking := operation&linux.LOCK_NB != 0
operation &^= linux.LOCK_NB
- // flock(2):
- // Locks created by flock() are associated with an open file table entry. This means that
- // duplicate file descriptors (created by, for example, fork(2) or dup(2)) refer to the
- // same lock, and this lock may be modified or released using any of these descriptors. Furthermore,
- // the lock is released either by an explicit LOCK_UN operation on any of these duplicate
- // descriptors, or when all such descriptors have been closed.
- //
- // If a process uses open(2) (or similar) to obtain more than one descriptor for the same file,
- // these descriptors are treated independently by flock(). An attempt to lock the file using
- // one of these file descriptors may be denied by a lock that the calling process has already placed via
- // another descriptor.
- //
- // We use the File UniqueID as the lock UniqueID because it needs to reference the same lock across dup(2)
- // and fork(2).
- lockUniqueID := lock.UniqueID(file.UniqueID)
-
// A BSD style lock spans the entire file.
rng := lock.LockRange{
Start: 0,
@@ -2183,29 +2156,29 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
case linux.LOCK_EX:
if nonblocking {
// Since we're nonblocking we pass a nil lock.Blocker implementation.
- if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) {
+ if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, nil) {
return 0, nil, syserror.EWOULDBLOCK
}
} else {
// Because we're blocking we will pass the task to satisfy the lock.Blocker interface.
- if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.WriteLock, rng, t) {
+ if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, t) {
return 0, nil, syserror.EINTR
}
}
case linux.LOCK_SH:
if nonblocking {
// Since we're nonblocking we pass a nil lock.Blocker implementation.
- if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) {
+ if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, nil) {
return 0, nil, syserror.EWOULDBLOCK
}
} else {
// Because we're blocking we will pass the task to satisfy the lock.Blocker interface.
- if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.ReadLock, rng, t) {
+ if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, t) {
return 0, nil, syserror.EINTR
}
}
case linux.LOCK_UN:
- file.Dirent.Inode.LockCtx.BSD.UnlockRegion(lockUniqueID, rng)
+ file.Dirent.Inode.LockCtx.BSD.UnlockRegion(file, rng)
default:
// flock(2): EINVAL operation is invalid.
return 0, nil, syserror.EINVAL
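Passing the *fs.File itself as the lock owner preserves the flock(2) semantics spelled out in the comment this hunk deletes: descriptors duplicated by dup(2) or fork(2) share one open file description and hence one lock owner, while a second open(2) of the same file gets an independent one. Pointer identity gives exactly that, as a toy example shows:

package main

import "fmt"

type file struct{ name string }

func main() {
	f := &file{"a"}     // open("a")
	dup := f            // dup(2): same open file description
	other := &file{"a"} // open("a") again: a new description
	owners := map[*file]bool{f: true}
	fmt.Println(owners[dup])   // true: the dup shares the lock owner
	fmt.Println(owners[other]) // false: an independent owner
}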
@@ -2251,8 +2224,8 @@ func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
return 0, nil, err
}
- defer dirent.DecRef()
- defer file.DecRef()
+ defer dirent.DecRef(t)
+ defer file.DecRef(t)
newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{
CloseOnExec: cloExec,
diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go
index b68261f72..9d1b2edb1 100644
--- a/pkg/sentry/syscalls/linux/sys_futex.go
+++ b/pkg/sentry/syscalls/linux/sys_futex.go
@@ -73,7 +73,7 @@ func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, fo
err = t.BlockWithDeadline(w.C, true, ktime.FromTimespec(ts))
}
- t.Futex().WaitComplete(w)
+ t.Futex().WaitComplete(w, t)
return 0, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
}
@@ -95,7 +95,7 @@ func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, add
}
remaining, err := t.BlockWithTimeout(w.C, !forever, duration)
- t.Futex().WaitComplete(w)
+ t.Futex().WaitComplete(w, t)
if err == nil {
return 0, nil
}
@@ -148,7 +148,7 @@ func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr usermem.A
timer.Destroy()
}
- t.Futex().WaitComplete(w)
+ t.Futex().WaitComplete(w, t)
return syserror.ConvertIntr(err, kernel.ERESTARTSYS)
}
@@ -198,7 +198,7 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
switch cmd {
case linux.FUTEX_WAIT:
// WAIT uses a relative timeout.
- mask = ^uint32(0)
+ mask = linux.FUTEX_BITSET_MATCH_ANY
var timeoutDur time.Duration
if !forever {
timeoutDur = time.Duration(timespec.ToNsecCapped()) * time.Nanosecond
@@ -286,3 +286,49 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return 0, nil, syserror.ENOSYS
}
}
+
+// SetRobustList implements linux syscall set_robust_list(2).
+func SetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ head := args[0].Pointer()
+ length := args[1].SizeT()
+
+ if length != uint(linux.SizeOfRobustListHead) {
+ return 0, nil, syserror.EINVAL
+ }
+ t.SetRobustList(head)
+ return 0, nil, nil
+}
+
+// GetRobustList implements linux syscall get_robust_list(2).
+func GetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ // Despite the syscall using the name 'pid' for this variable, it is
+ // very much a tid.
+ tid := args[0].Int()
+ head := args[1].Pointer()
+ size := args[2].Pointer()
+
+ if tid < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ ot := t
+ if tid != 0 {
+ if ot = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)); ot == nil {
+ return 0, nil, syserror.ESRCH
+ }
+ }
+
+ // Copy out head pointer.
+ if _, err := t.CopyOut(head, uint64(ot.GetRobustList())); err != nil {
+ return 0, nil, err
+ }
+
+ // Copy out size, which is a constant.
+ if _, err := t.CopyOut(size, uint64(linux.SizeOfRobustListHead)); err != nil {
+ return 0, nil, err
+ }
+
+ return 0, nil, nil
+}
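The size check in SetRobustList is against a fixed ABI layout; sketched below from the Linux UAPI struct robust_list_head (24 bytes on 64-bit, which is the value linux.SizeOfRobustListHead must carry), not copied from gvisor:

package sketch

// robustList mirrors struct robust_list: a userspace pointer to the
// next entry in the thread's chain of held robust futexes.
type robustList struct {
	Next uint64
}

// robustListHead mirrors struct robust_list_head.
type robustListHead struct {
	List          robustList
	FutexOffset   int64  // offset from a list entry to its futex word
	ListOpPending uint64 // entry being acquired/released at death, if any
}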
diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go
index b126fecc0..f5699e55d 100644
--- a/pkg/sentry/syscalls/linux/sys_getdents.go
+++ b/pkg/sentry/syscalls/linux/sys_getdents.go
@@ -68,7 +68,7 @@ func getdents(t *kernel.Task, fd int32, addr usermem.Addr, size int, f func(*dir
if dir == nil {
return 0, syserror.EBADF
}
- defer dir.DecRef()
+ defer dir.DecRef(t)
w := &usermem.IOReadWriter{
Ctx: t,
diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go
index b2c7b3444..cf47bb9dd 100644
--- a/pkg/sentry/syscalls/linux/sys_inotify.go
+++ b/pkg/sentry/syscalls/linux/sys_inotify.go
@@ -40,7 +40,7 @@ func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
NonBlocking: flags&linux.IN_NONBLOCK != 0,
}
n := fs.NewFile(t, dirent, fileFlags, fs.NewInotify(t))
- defer n.DecRef()
+ defer n.DecRef(t)
fd, err := t.NewFDFrom(0, n, kernel.FDFlags{
CloseOnExec: flags&linux.IN_CLOEXEC != 0,
@@ -71,7 +71,7 @@ func fdToInotify(t *kernel.Task, fd int32) (*fs.Inotify, *fs.File, error) {
ino, ok := file.FileOperations.(*fs.Inotify)
if !ok {
// Not an inotify fd.
- file.DecRef()
+ file.DecRef(t)
return nil, nil, syserror.EINVAL
}
@@ -98,7 +98,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern
if err != nil {
return 0, nil, err
}
- defer file.DecRef()
+ defer file.DecRef(t)
path, _, err := copyInPath(t, addr, false /* allowEmpty */)
if err != nil {
@@ -128,6 +128,6 @@ func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
if err != nil {
return 0, nil, err
}
- defer file.DecRef()
- return 0, nil, ino.RmWatch(wd)
+ defer file.DecRef(t)
+ return 0, nil, ino.RmWatch(t, wd)
}
diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go
index 3f7691eae..1c38f8f4f 100644
--- a/pkg/sentry/syscalls/linux/sys_lseek.go
+++ b/pkg/sentry/syscalls/linux/sys_lseek.go
@@ -33,7 +33,7 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
var sw fs.SeekWhence
switch whence {
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go
index 91694d374..72786b032 100644
--- a/pkg/sentry/syscalls/linux/sys_mmap.go
+++ b/pkg/sentry/syscalls/linux/sys_mmap.go
@@ -75,7 +75,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
}
defer func() {
if opts.MappingIdentity != nil {
- opts.MappingIdentity.DecRef()
+ opts.MappingIdentity.DecRef(t)
}
}()
@@ -85,7 +85,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
flags := file.Flags()
// mmap unconditionally requires that the FD is readable.
diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go
index eb5ff48f5..bd0633564 100644
--- a/pkg/sentry/syscalls/linux/sys_mount.go
+++ b/pkg/sentry/syscalls/linux/sys_mount.go
@@ -115,7 +115,7 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}); err != nil {
// Something went wrong. Drop our ref on rootInode before
// returning the error.
- rootInode.DecRef()
+ rootInode.DecRef(t)
return 0, nil, err
}
diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go
index 43c510930..3149e4aad 100644
--- a/pkg/sentry/syscalls/linux/sys_pipe.go
+++ b/pkg/sentry/syscalls/linux/sys_pipe.go
@@ -34,10 +34,10 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) {
r, w := pipe.NewConnectedPipe(t, pipe.DefaultPipeSize, usermem.PageSize)
r.SetFlags(linuxToFlags(flags).Settable())
- defer r.DecRef()
+ defer r.DecRef(t)
w.SetFlags(linuxToFlags(flags).Settable())
- defer w.DecRef()
+ defer w.DecRef(t)
fds, err := t.NewFDs(0, []*fs.File{r, w}, kernel.FDFlags{
CloseOnExec: flags&linux.O_CLOEXEC != 0,
@@ -49,7 +49,7 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) {
if _, err := t.CopyOut(addr, fds); err != nil {
for _, fd := range fds {
if file, _ := t.FDTable().Remove(fd); file != nil {
- file.DecRef()
+ file.DecRef(t)
}
}
return 0, err
diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go
index f0198141c..3435bdf77 100644
--- a/pkg/sentry/syscalls/linux/sys_poll.go
+++ b/pkg/sentry/syscalls/linux/sys_poll.go
@@ -70,7 +70,7 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan
}
if ch == nil {
- defer file.DecRef()
+ defer file.DecRef(t)
} else {
state.file = file
state.waiter, _ = waiter.NewChannelEntry(ch)
@@ -82,11 +82,11 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan
}
// releaseState releases all the pollState in "state".
-func releaseState(state []pollState) {
+func releaseState(t *kernel.Task, state []pollState) {
for i := range state {
if state[i].file != nil {
state[i].file.EventUnregister(&state[i].waiter)
- state[i].file.DecRef()
+ state[i].file.DecRef(t)
}
}
}
@@ -107,7 +107,7 @@ func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time.
// result, we stop registering for events but still go through all files
// to get their ready masks.
state := make([]pollState, len(pfd))
- defer releaseState(state)
+ defer releaseState(t, state)
n := uintptr(0)
for i := range pfd {
initReadiness(t, &pfd[i], &state[i], ch)
@@ -266,7 +266,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add
if file == nil {
return 0, syserror.EBADF
}
- file.DecRef()
+ file.DecRef(t)
var mask int16
if (rV & m) != 0 {
diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go
index f92bf8096..64a725296 100644
--- a/pkg/sentry/syscalls/linux/sys_prctl.go
+++ b/pkg/sentry/syscalls/linux/sys_prctl.go
@@ -128,7 +128,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Are they trying to set exe to a non-file?
if !fs.IsFile(file.Dirent.Inode.StableAttr) {
@@ -136,7 +136,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
// Set the underlying executable.
- t.MemoryManager().SetExecutable(fsbridge.NewFSFile(file))
+ t.MemoryManager().SetExecutable(t, fsbridge.NewFSFile(file))
case linux.PR_SET_MM_AUXV,
linux.PR_SET_MM_START_CODE,
diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go
index 071b4bacc..3bbc3fa4b 100644
--- a/pkg/sentry/syscalls/linux/sys_read.go
+++ b/pkg/sentry/syscalls/linux/sys_read.go
@@ -48,7 +48,7 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the file is readable.
if !file.Flags().Read {
@@ -84,7 +84,7 @@ func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the file is readable.
if !file.Flags().Read {
@@ -118,7 +118,7 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the offset is legitimate and does not overflow.
if offset < 0 || offset+int64(size) < 0 {
@@ -164,7 +164,7 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the file is readable.
if !file.Flags().Read {
@@ -195,7 +195,7 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the offset is legitimate.
if offset < 0 {
@@ -244,7 +244,7 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the offset is legitimate.
if offset < -1 {
diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go
index 4a8bc24a2..f0ae8fa8e 100644
--- a/pkg/sentry/syscalls/linux/sys_shm.go
+++ b/pkg/sentry/syscalls/linux/sys_shm.go
@@ -39,7 +39,7 @@ func Shmget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if err != nil {
return 0, nil, err
}
- defer segment.DecRef()
+ defer segment.DecRef(t)
return uintptr(segment.ID), nil, nil
}
@@ -66,7 +66,7 @@ func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if err != nil {
return 0, nil, syserror.EINVAL
}
- defer segment.DecRef()
+ defer segment.DecRef(t)
opts, err := segment.ConfigureAttach(t, addr, shm.AttachOpts{
Execute: flag&linux.SHM_EXEC == linux.SHM_EXEC,
@@ -108,7 +108,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if err != nil {
return 0, nil, syserror.EINVAL
}
- defer segment.DecRef()
+ defer segment.DecRef(t)
stat, err := segment.IPCStat(t)
if err == nil {
@@ -132,7 +132,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if err != nil {
return 0, nil, syserror.EINVAL
}
- defer segment.DecRef()
+ defer segment.DecRef(t)
switch cmd {
case linux.IPC_SET:
@@ -145,7 +145,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return 0, nil, err
case linux.IPC_RMID:
- segment.MarkDestroyed()
+ segment.MarkDestroyed(t)
return 0, nil, nil
case linux.SHM_LOCK, linux.SHM_UNLOCK:
diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go
index d2b0012ae..20cb1a5cb 100644
--- a/pkg/sentry/syscalls/linux/sys_signal.go
+++ b/pkg/sentry/syscalls/linux/sys_signal.go
@@ -536,7 +536,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize ui
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Is this a signalfd?
if s, ok := file.FileOperations.(*signalfd.SignalOperations); ok {
@@ -553,7 +553,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize ui
if err != nil {
return 0, nil, err
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Set appropriate flags.
file.SetFlags(fs.SettableFileFlags{
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index 0760af77b..fec1c1974 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -29,6 +29,8 @@ import (
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/tools/go_marshal/marshal"
+ "gvisor.dev/gvisor/tools/go_marshal/primitive"
)
// LINT.IfChange
@@ -198,7 +200,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
s.SetFlags(fs.SettableFileFlags{
NonBlocking: stype&linux.SOCK_NONBLOCK != 0,
})
- defer s.DecRef()
+ defer s.DecRef(t)
fd, err := t.NewFDFrom(0, s, kernel.FDFlags{
CloseOnExec: stype&linux.SOCK_CLOEXEC != 0,
@@ -233,8 +235,8 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
}
s1.SetFlags(fileFlags)
s2.SetFlags(fileFlags)
- defer s1.DecRef()
- defer s2.DecRef()
+ defer s1.DecRef(t)
+ defer s2.DecRef(t)
// Create the FDs for the sockets.
fds, err := t.NewFDs(0, []*fs.File{s1, s2}, kernel.FDFlags{
@@ -248,7 +250,7 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
if _, err := t.CopyOut(socks, fds); err != nil {
for _, fd := range fds {
if file, _ := t.FDTable().Remove(fd); file != nil {
- file.DecRef()
+ file.DecRef(t)
}
}
return 0, nil, err
@@ -268,7 +270,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -299,7 +301,7 @@ func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, f
if file == nil {
return 0, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -358,7 +360,7 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -385,7 +387,7 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -414,7 +416,7 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -445,7 +447,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -474,7 +476,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
}
if v != nil {
- if _, err := t.CopyOut(optValAddr, v); err != nil {
+ if _, err := v.CopyOut(t, optValAddr); err != nil {
return 0, nil, err
}
}
@@ -484,7 +486,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
// getSockOpt tries to handle common socket options, or dispatches to a specific
// socket implementation.
-func getSockOpt(t *kernel.Task, s socket.Socket, level, name int, optValAddr usermem.Addr, len int) (interface{}, *syserr.Error) {
+func getSockOpt(t *kernel.Task, s socket.Socket, level, name int, optValAddr usermem.Addr, len int) (marshal.Marshallable, *syserr.Error) {
if level == linux.SOL_SOCKET {
switch name {
case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL:
@@ -496,13 +498,16 @@ func getSockOpt(t *kernel.Task, s socket.Socket, level, name int, optValAddr use
switch name {
case linux.SO_TYPE:
_, skType, _ := s.Type()
- return int32(skType), nil
+ v := primitive.Int32(skType)
+ return &v, nil
case linux.SO_DOMAIN:
family, _, _ := s.Type()
- return int32(family), nil
+ v := primitive.Int32(family)
+ return &v, nil
case linux.SO_PROTOCOL:
_, _, protocol := s.Type()
- return int32(protocol), nil
+ v := primitive.Int32(protocol)
+ return &v, nil
}
}
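Returning marshal.Marshallable is what lets GetSockOpt, in the earlier hunk, call v.CopyOut(t, optValAddr) instead of reflecting over an interface{}. The essence of the wrapper types, sketched with invented names rather than go_marshal's generated code:

package sketch

import "encoding/binary"

// marshallable is the minimal shape the call sites above rely on.
type marshallable interface {
	SizeBytes() int
	MarshalBytes(dst []byte)
}

// int32Prim plays the role of primitive.Int32: a named integer that
// knows its own wire encoding.
type int32Prim int32

func (i *int32Prim) SizeBytes() int { return 4 }

func (i *int32Prim) MarshalBytes(dst []byte) {
	binary.LittleEndian.PutUint32(dst, uint32(*i))
}

A CopyOut built on this needs no type switch: it asks SizeBytes, marshals into a scratch buffer, and writes those bytes to optValAddr.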
@@ -524,7 +529,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -539,7 +544,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
return 0, nil, syserror.EINVAL
}
buf := t.CopyScratchBuffer(int(optLen))
- if _, err := t.CopyIn(optValAddr, &buf); err != nil {
+ if _, err := t.CopyInBytes(optValAddr, buf); err != nil {
return 0, nil, err
}
@@ -562,7 +567,7 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -590,7 +595,7 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -623,7 +628,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -676,7 +681,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -770,7 +775,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i
}
if !cms.Unix.Empty() {
mflags |= linux.MSG_CTRUNC
- cms.Release()
+ cms.Release(t)
}
if int(msg.Flags) != mflags {
@@ -790,7 +795,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i
if e != nil {
return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
}
- defer cms.Release()
+ defer cms.Release(t)
controlData := make([]byte, 0, msg.ControlLen)
controlData = control.PackControlMessages(t, cms, controlData)
@@ -846,7 +851,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag
if file == nil {
return 0, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -875,7 +880,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag
}
n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0)
- cm.Release()
+ cm.Release(t)
if e != nil {
return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
}
@@ -919,7 +924,7 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -957,7 +962,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
@@ -1061,7 +1066,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme
n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages)
err = handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file)
if err != nil {
- controlMessages.Release()
+ controlMessages.Release(t)
}
return uintptr(n), err
}
@@ -1079,7 +1084,7 @@ func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags
if file == nil {
return 0, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.FileOperations.(socket.Socket)
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index 39f2b79ec..b8846a10a 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -80,6 +80,12 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB
}
}
+ if total > 0 {
+ // On Linux, inotify behavior is not very consistent with splice(2). We try
+ // our best to emulate Linux for very basic calls to splice, where for some
+ // reason, events are generated for output files, but not input files.
+ outFile.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+ }
return total, err
}
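The rule the new block encodes is compact enough to state as code: fire IN_MODIFY on the destination only, and only when bytes actually moved. Stand-in types below; the IN_MODIFY value is from the inotify UAPI:

package sketch

const inModify = 0x2 // IN_MODIFY, include/uapi/linux/inotify.h

type notifier interface {
	InotifyEvent(events, cookie uint32)
}

// spliceNotify applies the post-splice rule from the hunk above.
func spliceNotify(out notifier, total int64) {
	if total > 0 {
		out.InotifyEvent(inModify, 0)
	}
}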
@@ -95,7 +101,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if inFile == nil {
return 0, nil, syserror.EBADF
}
- defer inFile.DecRef()
+ defer inFile.DecRef(t)
if !inFile.Flags().Read {
return 0, nil, syserror.EBADF
@@ -105,7 +111,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if outFile == nil {
return 0, nil, syserror.EBADF
}
- defer outFile.DecRef()
+ defer outFile.DecRef(t)
if !outFile.Flags().Write {
return 0, nil, syserror.EBADF
@@ -186,13 +192,13 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if outFile == nil {
return 0, nil, syserror.EBADF
}
- defer outFile.DecRef()
+ defer outFile.DecRef(t)
inFile := t.GetFile(inFD)
if inFile == nil {
return 0, nil, syserror.EBADF
}
- defer inFile.DecRef()
+ defer inFile.DecRef(t)
// The operation is non-blocking if anything is non-blocking.
//
@@ -294,13 +300,13 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
if outFile == nil {
return 0, nil, syserror.EBADF
}
- defer outFile.DecRef()
+ defer outFile.DecRef(t)
inFile := t.GetFile(inFD)
if inFile == nil {
return 0, nil, syserror.EBADF
}
- defer inFile.DecRef()
+ defer inFile.DecRef(t)
// All files must be pipes.
if !fs.IsPipe(inFile.Dirent.Inode.StableAttr) || !fs.IsPipe(outFile.Dirent.Inode.StableAttr) {
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
index 46ebf27a2..a5826f2dd 100644
--- a/pkg/sentry/syscalls/linux/sys_stat.go
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -58,7 +58,7 @@ func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
return 0, nil, fstat(t, file, statAddr)
}
@@ -100,7 +100,7 @@ func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
return 0, nil, fstat(t, file, statAddr)
}
@@ -158,7 +158,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
uattr, err := file.UnstableAttr(t)
if err != nil {
return 0, nil, err
@@ -249,7 +249,7 @@ func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
return 0, nil, statfsImpl(t, file.Dirent, statfsAddr)
}
diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go
index 5ad465ae3..f2c0e5069 100644
--- a/pkg/sentry/syscalls/linux/sys_sync.go
+++ b/pkg/sentry/syscalls/linux/sys_sync.go
@@ -39,7 +39,7 @@ func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Use "sync-the-world" for now, it's guaranteed that fd is at least
// on the root filesystem.
@@ -54,7 +54,7 @@ func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncAll)
return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
@@ -70,7 +70,7 @@ func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncData)
return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
@@ -103,7 +103,7 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// SYNC_FILE_RANGE_WAIT_BEFORE waits upon write-out of all pages in the
// specified range that have already been submitted to the device
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index 00915fdde..2d16e4933 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -117,7 +117,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
resolveFinal := flags&linux.AT_SYMLINK_NOFOLLOW == 0
root := t.FSContext().RootDirectory()
- defer root.DecRef()
+ defer root.DecRef(t)
var wd *fs.Dirent
var executable fsbridge.File
@@ -133,7 +133,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
if f == nil {
return 0, nil, syserror.EBADF
}
- defer f.DecRef()
+ defer f.DecRef(t)
closeOnExec = fdFlags.CloseOnExec
if atEmptyPath && len(pathname) == 0 {
@@ -155,7 +155,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
}
}
if wd != nil {
- defer wd.DecRef()
+ defer wd.DecRef(t)
}
// Load the new TaskContext.
diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go
index cf49b43db..34b03e4ee 100644
--- a/pkg/sentry/syscalls/linux/sys_timerfd.go
+++ b/pkg/sentry/syscalls/linux/sys_timerfd.go
@@ -43,7 +43,7 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
return 0, nil, syserror.EINVAL
}
f := timerfd.NewFile(t, c)
- defer f.DecRef()
+ defer f.DecRef(t)
f.SetFlags(fs.SettableFileFlags{
NonBlocking: flags&linux.TFD_NONBLOCK != 0,
})
@@ -73,7 +73,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
if f == nil {
return 0, nil, syserror.EBADF
}
- defer f.DecRef()
+ defer f.DecRef(t)
tf, ok := f.FileOperations.(*timerfd.TimerOperations)
if !ok {
@@ -107,7 +107,7 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
if f == nil {
return 0, nil, syserror.EBADF
}
- defer f.DecRef()
+ defer f.DecRef(t)
tf, ok := f.FileOperations.(*timerfd.TimerOperations)
if !ok {
diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go
index 6ec0de96e..485526e28 100644
--- a/pkg/sentry/syscalls/linux/sys_write.go
+++ b/pkg/sentry/syscalls/linux/sys_write.go
@@ -48,7 +48,7 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the file is writable.
if !file.Flags().Write {
@@ -85,7 +85,7 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the offset is legitimate and does not overflow.
if offset < 0 || offset+int64(size) < 0 {
@@ -131,7 +131,7 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the file is writable.
if !file.Flags().Write {
@@ -162,7 +162,7 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the offset is legitimate.
if offset < 0 {
@@ -215,7 +215,7 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the offset is legitimate.
if offset < -1 {
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
index 2de5e3422..97474fd3c 100644
--- a/pkg/sentry/syscalls/linux/sys_xattr.go
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -49,7 +49,7 @@ func FGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if f == nil {
return 0, nil, syserror.EBADF
}
- defer f.DecRef()
+ defer f.DecRef(t)
n, err := getXattr(t, f.Dirent, nameAddr, valueAddr, size)
if err != nil {
@@ -153,7 +153,7 @@ func FSetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if f == nil {
return 0, nil, syserror.EBADF
}
- defer f.DecRef()
+ defer f.DecRef(t)
return 0, nil, setXattr(t, f.Dirent, nameAddr, valueAddr, uint64(size), flags)
}
@@ -207,7 +207,11 @@ func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, si
return syserror.EOPNOTSUPP
}
- return d.Inode.SetXattr(t, d, name, value, flags)
+ if err := d.Inode.SetXattr(t, d, name, value, flags); err != nil {
+ return err
+ }
+ d.InotifyEvent(linux.IN_ATTRIB, 0)
+ return nil
}
func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
@@ -266,7 +270,7 @@ func FListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
if f == nil {
return 0, nil, syserror.EBADF
}
- defer f.DecRef()
+ defer f.DecRef(t)
n, err := listXattr(t, f.Dirent, listAddr, size)
if err != nil {
@@ -380,7 +384,7 @@ func FRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
if f == nil {
return 0, nil, syserror.EBADF
}
- defer f.DecRef()
+ defer f.DecRef(t)
return 0, nil, removeXattr(t, f.Dirent, nameAddr)
}
@@ -418,7 +422,11 @@ func removeXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr) error {
return syserror.EOPNOTSUPP
}
- return d.Inode.RemoveXattr(t, d, name)
+ if err := d.Inode.RemoveXattr(t, d, name); err != nil {
+ return err
+ }
+ d.InotifyEvent(linux.IN_ATTRIB, 0)
+ return nil
}
// LINT.ThenChange(vfs2/xattr.go)
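With the two hooks added above, both mutating xattr paths now generate IN_ATTRIB on the dirent. A rough user-space check (sketch; error handling elided, path and attribute name are placeholders):

    // import "golang.org/x/sys/unix"
    ifd, _ := unix.InotifyInit()
    unix.InotifyAddWatch(ifd, "file.txt", unix.IN_ATTRIB)

    unix.Setxattr("file.txt", "user.comment", []byte("v1"), 0) // queues IN_ATTRIB
    unix.Removexattr("file.txt", "user.comment")               // queues IN_ATTRIB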
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
index f882ef840..64696b438 100644
--- a/pkg/sentry/syscalls/linux/vfs2/BUILD
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -5,6 +5,7 @@ package(licenses = ["notice"])
go_library(
name = "vfs2",
srcs = [
+ "aio.go",
"epoll.go",
"eventfd.go",
"execve.go",
@@ -12,9 +13,12 @@ go_library(
"filesystem.go",
"fscontext.go",
"getdents.go",
+ "inotify.go",
"ioctl.go",
+ "lock.go",
"memfd.go",
"mmap.go",
+ "mount.go",
"path.go",
"pipe.go",
"poll.go",
@@ -22,6 +26,7 @@ go_library(
"setstat.go",
"signal.go",
"socket.go",
+ "splice.go",
"stat.go",
"stat_amd64.go",
"stat_arm64.go",
@@ -36,9 +41,11 @@ go_library(
"//pkg/abi/linux",
"//pkg/binary",
"//pkg/bits",
+ "//pkg/context",
"//pkg/fspath",
"//pkg/gohacks",
"//pkg/sentry/arch",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/fsbridge",
"//pkg/sentry/fsimpl/eventfd",
"//pkg/sentry/fsimpl/pipefs",
@@ -47,11 +54,13 @@ go_library(
"//pkg/sentry/fsimpl/tmpfs",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/fasync",
"//pkg/sentry/kernel/pipe",
"//pkg/sentry/kernel/time",
"//pkg/sentry/limits",
"//pkg/sentry/loader",
"//pkg/sentry/memmap",
+ "//pkg/sentry/mm",
"//pkg/sentry/socket",
"//pkg/sentry/socket/control",
"//pkg/sentry/socket/unix/transport",
@@ -63,5 +72,7 @@ go_library(
"//pkg/syserror",
"//pkg/usermem",
"//pkg/waiter",
+ "//tools/go_marshal/marshal",
+ "//tools/go_marshal/primitive",
],
)
diff --git a/pkg/sentry/syscalls/linux/vfs2/aio.go b/pkg/sentry/syscalls/linux/vfs2/aio.go
new file mode 100644
index 000000000..42559bf69
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/aio.go
@@ -0,0 +1,219 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/mm"
+ slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// IoSubmit implements Linux syscall io_submit(2).
+func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ id := args[0].Uint64()
+ nrEvents := args[1].Int()
+ addr := args[2].Pointer()
+
+ if nrEvents < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ for i := int32(0); i < nrEvents; i++ {
+ // Copy in the address.
+ cbAddrNative := t.Arch().Native(0)
+ if _, err := t.CopyIn(addr, cbAddrNative); err != nil {
+ if i > 0 {
+ // Partial success.
+ return uintptr(i), nil, nil
+ }
+ // Nothing done.
+ return 0, nil, err
+ }
+
+ // Copy in this callback.
+ var cb linux.IOCallback
+ cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative))
+ if _, err := t.CopyIn(cbAddr, &cb); err != nil {
+ if i > 0 {
+ // Partial success.
+ return uintptr(i), nil, nil
+ }
+ // Nothing done.
+ return 0, nil, err
+ }
+
+ // Process this callback.
+ if err := submitCallback(t, id, &cb, cbAddr); err != nil {
+ if i > 0 {
+ // Partial success.
+ return uintptr(i), nil, nil
+ }
+ // Nothing done.
+ return 0, nil, err
+ }
+
+ // Advance to the next one.
+ addr += usermem.Addr(t.Arch().Width())
+ }
+
+ return uintptr(nrEvents), nil, nil
+}
+
+// submitCallback processes a single callback.
+func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr usermem.Addr) error {
+ if cb.Reserved2 != 0 {
+ return syserror.EINVAL
+ }
+
+ fd := t.GetFileVFS2(cb.FD)
+ if fd == nil {
+ return syserror.EBADF
+ }
+ defer fd.DecRef(t)
+
+ // Was there an eventFD? Extract it.
+ var eventFD *vfs.FileDescription
+ if cb.Flags&linux.IOCB_FLAG_RESFD != 0 {
+ eventFD = t.GetFileVFS2(cb.ResFD)
+ if eventFD == nil {
+ return syserror.EBADF
+ }
+ defer eventFD.DecRef(t)
+
+ // Check that it is an eventfd.
+ if _, ok := eventFD.Impl().(*eventfd.EventFileDescription); !ok {
+ return syserror.EINVAL
+ }
+ }
+
+ ioseq, err := memoryFor(t, cb)
+ if err != nil {
+ return err
+ }
+
+ // Check offset for reads/writes.
+ switch cb.OpCode {
+ case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV:
+ if cb.Offset < 0 {
+ return syserror.EINVAL
+ }
+ }
+
+ // Prepare the request.
+ aioCtx, ok := t.MemoryManager().LookupAIOContext(t, id)
+ if !ok {
+ return syserror.EINVAL
+ }
+ if ready := aioCtx.Prepare(); !ready {
+ // Context is busy.
+ return syserror.EAGAIN
+ }
+
+ if eventFD != nil {
+ // The request is set. Take an extra reference on the eventFD.
+ //
+ // This is necessary because the callback executes asynchronously on
+ // completion and is what will release this reference.
+ eventFD.IncRef()
+ }
+
+ // Perform the request asynchronously.
+ fd.IncRef()
+ t.QueueAIO(getAIOCallback(t, fd, eventFD, cbAddr, cb, ioseq, aioCtx))
+ return nil
+}
+
+func getAIOCallback(t *kernel.Task, fd, eventFD *vfs.FileDescription, cbAddr usermem.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, aioCtx *mm.AIOContext) kernel.AIOCallback {
+ return func(ctx context.Context) {
+ // Release references after completing the callback.
+ defer fd.DecRef(ctx)
+ if eventFD != nil {
+ defer eventFD.DecRef(ctx)
+ }
+
+ if aioCtx.Dead() {
+ aioCtx.CancelPendingRequest()
+ return
+ }
+ ev := &linux.IOEvent{
+ Data: cb.Data,
+ Obj: uint64(cbAddr),
+ }
+
+ var err error
+ switch cb.OpCode {
+ case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV:
+ ev.Result, err = fd.PRead(ctx, ioseq, cb.Offset, vfs.ReadOptions{})
+ case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV:
+ ev.Result, err = fd.PWrite(ctx, ioseq, cb.Offset, vfs.WriteOptions{})
+ case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC:
+ err = fd.Sync(ctx)
+ }
+
+ // Update the result.
+ if err != nil {
+ err = slinux.HandleIOErrorVFS2(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", fd)
+ ev.Result = -int64(kernel.ExtractErrno(err, 0))
+ }
+
+ // Queue the result for delivery.
+ aioCtx.FinishRequest(ev)
+
+ // Notify the event file if one was specified. This needs to happen
+ // *after* queueing the result to avoid racing with the thread we may
+ // wake up.
+ if eventFD != nil {
+ eventFD.Impl().(*eventfd.EventFileDescription).Signal(1)
+ }
+ }
+}
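Because Signal runs only after FinishRequest, a caller blocked on the eventfd can immediately harvest the completion with io_getevents(2). The submitter's side of the IOCB_FLAG_RESFD handshake looks roughly like this (sketch; assumes the eventfd was placed in the iocb's resfd field at submission):

    // import ("encoding/binary"; "golang.org/x/sys/unix")
    efd, _ := unix.Eventfd(0, unix.EFD_CLOEXEC)

    // ... submit iocbs carrying IOCB_FLAG_RESFD and aio_resfd = efd ...

    var buf [8]byte
    unix.Read(efd, buf[:]) // blocks until at least one completion
    done := binary.LittleEndian.Uint64(buf[:])
    _ = done // io_getevents(2) can now return this many events.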
+
+// memoryFor returns appropriate memory for the given callback.
+func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) {
+ bytes := int(cb.Bytes)
+ if bytes < 0 {
+ // Linux also requires that this field fit in ssize_t.
+ return usermem.IOSequence{}, syserror.EINVAL
+ }
+
+ // Since this I/O will be asynchronous with respect to t's task goroutine,
+ // we have no guarantee that t's AddressSpace will be active during the
+ // I/O.
+ switch cb.OpCode {
+ case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE:
+ return t.SingleIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{
+ AddressSpaceActive: false,
+ })
+
+ case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV:
+ return t.IovecsIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{
+ AddressSpaceActive: false,
+ })
+
+ case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP:
+ return usermem.IOSequence{}, nil
+
+ default:
+ // Not a supported command.
+ return usermem.IOSequence{}, syserror.EINVAL
+ }
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go
index 34c90ae3e..c62f03509 100644
--- a/pkg/sentry/syscalls/linux/vfs2/epoll.go
+++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go
@@ -37,11 +37,11 @@ func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
return 0, nil, syserror.EINVAL
}
- file, err := t.Kernel().VFS().NewEpollInstanceFD()
+ file, err := t.Kernel().VFS().NewEpollInstanceFD(t)
if err != nil {
return 0, nil, err
}
- defer file.DecRef()
+ defer file.DecRef(t)
fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
CloseOnExec: flags&linux.EPOLL_CLOEXEC != 0,
@@ -62,11 +62,11 @@ func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
return 0, nil, syserror.EINVAL
}
- file, err := t.Kernel().VFS().NewEpollInstanceFD()
+ file, err := t.Kernel().VFS().NewEpollInstanceFD(t)
if err != nil {
return 0, nil, err
}
- defer file.DecRef()
+ defer file.DecRef(t)
fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{})
if err != nil {
@@ -86,7 +86,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if epfile == nil {
return 0, nil, syserror.EBADF
}
- defer epfile.DecRef()
+ defer epfile.DecRef(t)
ep, ok := epfile.Impl().(*vfs.EpollInstance)
if !ok {
return 0, nil, syserror.EINVAL
@@ -95,7 +95,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
if epfile == file {
return 0, nil, syserror.EINVAL
}
@@ -135,7 +135,7 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if epfile == nil {
return 0, nil, syserror.EBADF
}
- defer epfile.DecRef()
+ defer epfile.DecRef(t)
ep, ok := epfile.Impl().(*vfs.EpollInstance)
if !ok {
return 0, nil, syserror.EINVAL
diff --git a/pkg/sentry/syscalls/linux/vfs2/eventfd.go b/pkg/sentry/syscalls/linux/vfs2/eventfd.go
index aff1a2070..807f909da 100644
--- a/pkg/sentry/syscalls/linux/vfs2/eventfd.go
+++ b/pkg/sentry/syscalls/linux/vfs2/eventfd.go
@@ -38,11 +38,11 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
fileFlags |= linux.O_NONBLOCK
}
semMode := flags&linux.EFD_SEMAPHORE != 0
- eventfd, err := eventfd.New(vfsObj, initVal, semMode, fileFlags)
+ eventfd, err := eventfd.New(t, vfsObj, initVal, semMode, fileFlags)
if err != nil {
return 0, nil, err
}
- defer eventfd.DecRef()
+ defer eventfd.DecRef(t)
fd, err := t.NewFDFromVFS2(0, eventfd, kernel.FDFlags{
CloseOnExec: flags&linux.EFD_CLOEXEC != 0,
diff --git a/pkg/sentry/syscalls/linux/vfs2/execve.go b/pkg/sentry/syscalls/linux/vfs2/execve.go
index aef0078a8..066ee0863 100644
--- a/pkg/sentry/syscalls/linux/vfs2/execve.go
+++ b/pkg/sentry/syscalls/linux/vfs2/execve.go
@@ -71,7 +71,7 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr user
}
root := t.FSContext().RootDirectoryVFS2()
- defer root.DecRef()
+ defer root.DecRef(t)
var executable fsbridge.File
closeOnExec := false
if path := fspath.Parse(pathname); dirfd != linux.AT_FDCWD && !path.Absolute {
@@ -90,7 +90,7 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr user
}
start := dirfile.VirtualDentry()
start.IncRef()
- dirfile.DecRef()
+ dirfile.DecRef(t)
closeOnExec = dirfileFlags.CloseOnExec
file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &vfs.PathOperation{
Root: root,
@@ -101,19 +101,19 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr user
Flags: linux.O_RDONLY,
FileExec: true,
})
- start.DecRef()
+ start.DecRef(t)
if err != nil {
return 0, nil, err
}
- defer file.DecRef()
+ defer file.DecRef(t)
executable = fsbridge.NewVFSFile(file)
}
// Load the new TaskContext.
mntns := t.MountNamespaceVFS2() // FIXME(jamieliu): useless refcount change
- defer mntns.DecRef()
+ defer mntns.DecRef(t)
wd := t.FSContext().WorkingDirectoryVFS2()
- defer wd.DecRef()
+ defer wd.DecRef(t)
remainingTraversals := uint(linux.MaxSymlinkTraversals)
loadArgs := loader.LoadArgs{
Opener: fsbridge.NewVFSLookup(mntns, root, wd),
diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go
index ca0f7fd1e..4856554fe 100644
--- a/pkg/sentry/syscalls/linux/vfs2/fd.go
+++ b/pkg/sentry/syscalls/linux/vfs2/fd.go
@@ -17,10 +17,13 @@ package vfs2
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/fasync"
"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -35,7 +38,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
err := file.OnClose(t)
return 0, nil, slinux.HandleIOErrorVFS2(t, false /* partial */, err, syserror.EINTR, "close", file)
@@ -49,7 +52,7 @@ func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
newFD, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{})
if err != nil {
@@ -69,7 +72,7 @@ func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if file == nil {
return 0, nil, syserror.EBADF
}
- file.DecRef()
+ file.DecRef(t)
return uintptr(newfd), nil, nil
}
@@ -98,7 +101,7 @@ func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.Sy
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
err := t.NewFDAtVFS2(newfd, file, kernel.FDFlags{
CloseOnExec: flags&linux.O_CLOEXEC != 0,
@@ -118,7 +121,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
switch cmd {
case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC:
@@ -134,10 +137,10 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return uintptr(flags.ToLinuxFDFlags()), nil, nil
case linux.F_SETFD:
flags := args[2].Uint()
- t.FDTable().SetFlags(fd, kernel.FDFlags{
+ err := t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{
CloseOnExec: flags&linux.FD_CLOEXEC != 0,
})
- return 0, nil, nil
+ return 0, nil, err
case linux.F_GETFL:
return uintptr(file.StatusFlags()), nil, nil
case linux.F_SETFL:
@@ -152,6 +155,41 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return 0, nil, err
}
return uintptr(n), nil, nil
+ case linux.F_GETOWN:
+ owner, hasOwner := getAsyncOwner(t, file)
+ if !hasOwner {
+ return 0, nil, nil
+ }
+ if owner.Type == linux.F_OWNER_PGRP {
+ return uintptr(-owner.PID), nil, nil
+ }
+ return uintptr(owner.PID), nil, nil
+ case linux.F_SETOWN:
+ who := args[2].Int()
+ ownerType := int32(linux.F_OWNER_PID)
+ if who < 0 {
+ // Check for overflow before flipping the sign.
+ if who-1 > who {
+ return 0, nil, syserror.EINVAL
+ }
+ ownerType = linux.F_OWNER_PGRP
+ who = -who
+ }
+ return 0, nil, setAsyncOwner(t, file, ownerType, who)
+ case linux.F_GETOWN_EX:
+ owner, hasOwner := getAsyncOwner(t, file)
+ if !hasOwner {
+ return 0, nil, nil
+ }
+ _, err := t.CopyOut(args[2].Pointer(), &owner)
+ return 0, nil, err
+ case linux.F_SETOWN_EX:
+ var owner linux.FOwnerEx
+ _, err := t.CopyIn(args[2].Pointer(), &owner)
+ if err != nil {
+ return 0, nil, err
+ }
+ return 0, nil, setAsyncOwner(t, file, owner.Type, owner.PID)
case linux.F_GETPIPE_SZ:
pipefile, ok := file.Impl().(*pipe.VFSPipeFD)
if !ok {
@@ -167,8 +205,151 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
err := tmpfs.AddSeals(file, args[2].Uint())
return 0, nil, err
+ case linux.F_SETLK, linux.F_SETLKW:
+ return 0, nil, posixLock(t, args, file, cmd)
default:
- // TODO(gvisor.dev/issue/1623): Everything else is not yet supported.
+ // Everything else is not yet supported.
return 0, nil, syserror.EINVAL
}
}
+
+func getAsyncOwner(t *kernel.Task, fd *vfs.FileDescription) (ownerEx linux.FOwnerEx, hasOwner bool) {
+ a := fd.AsyncHandler()
+ if a == nil {
+ return linux.FOwnerEx{}, false
+ }
+
+ ot, otg, opg := a.(*fasync.FileAsync).Owner()
+ switch {
+ case ot != nil:
+ return linux.FOwnerEx{
+ Type: linux.F_OWNER_TID,
+ PID: int32(t.PIDNamespace().IDOfTask(ot)),
+ }, true
+ case otg != nil:
+ return linux.FOwnerEx{
+ Type: linux.F_OWNER_PID,
+ PID: int32(t.PIDNamespace().IDOfThreadGroup(otg)),
+ }, true
+ case opg != nil:
+ return linux.FOwnerEx{
+ Type: linux.F_OWNER_PGRP,
+ PID: int32(t.PIDNamespace().IDOfProcessGroup(opg)),
+ }, true
+ default:
+ return linux.FOwnerEx{}, true
+ }
+}
+
+func setAsyncOwner(t *kernel.Task, fd *vfs.FileDescription, ownerType, pid int32) error {
+ switch ownerType {
+ case linux.F_OWNER_TID, linux.F_OWNER_PID, linux.F_OWNER_PGRP:
+ // Acceptable type.
+ default:
+ return syserror.EINVAL
+ }
+
+ a := fd.SetAsyncHandler(fasync.NewVFS2).(*fasync.FileAsync)
+ if pid == 0 {
+ a.ClearOwner()
+ return nil
+ }
+
+ switch ownerType {
+ case linux.F_OWNER_TID:
+ task := t.PIDNamespace().TaskWithID(kernel.ThreadID(pid))
+ if task == nil {
+ return syserror.ESRCH
+ }
+ a.SetOwnerTask(t, task)
+ return nil
+ case linux.F_OWNER_PID:
+ tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(pid))
+ if tg == nil {
+ return syserror.ESRCH
+ }
+ a.SetOwnerThreadGroup(t, tg)
+ return nil
+ case linux.F_OWNER_PGRP:
+ pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(pid))
+ if pg == nil {
+ return syserror.ESRCH
+ }
+ a.SetOwnerProcessGroup(t, pg)
+ return nil
+ default:
+ return syserror.EINVAL
+ }
+}
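The sign convention these helpers decode is the classic fcntl one: a positive F_SETOWN argument names a process, a negative one names a process group. From user space (sketch with golang.org/x/sys/unix; fd is any open descriptor and 1234 is an illustrative pgid):

    // import "golang.org/x/sys/unix"
    unix.FcntlInt(uintptr(fd), unix.F_SETOWN, unix.Getpid()) // this process
    unix.FcntlInt(uintptr(fd), unix.F_SETOWN, -1234)         // process group 1234

    who, _ := unix.FcntlInt(uintptr(fd), unix.F_GETOWN, 0)
    _ = who // > 0: a pid; < 0: the negated pgid.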
+
+func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, cmd int32) error {
+ // Copy in the lock request.
+ flockAddr := args[2].Pointer()
+ var flock linux.Flock
+ if _, err := t.CopyIn(flockAddr, &flock); err != nil {
+ return err
+ }
+
+ var blocker lock.Blocker
+ if cmd == linux.F_SETLKW {
+ blocker = t
+ }
+
+ switch flock.Type {
+ case linux.F_RDLCK:
+ if !file.IsReadable() {
+ return syserror.EBADF
+ }
+ return file.LockPOSIX(t, t.FDTable(), lock.ReadLock, uint64(flock.Start), uint64(flock.Len), flock.Whence, blocker)
+
+ case linux.F_WRLCK:
+ if !file.IsWritable() {
+ return syserror.EBADF
+ }
+ return file.LockPOSIX(t, t.FDTable(), lock.WriteLock, uint64(flock.Start), uint64(flock.Len), flock.Whence, blocker)
+
+ case linux.F_UNLCK:
+ return file.UnlockPOSIX(t, t.FDTable(), uint64(flock.Start), uint64(flock.Len), flock.Whence)
+
+ default:
+ return syserror.EINVAL
+ }
+}
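posixLock consumes the standard struct flock layout, so the F_SETLKW path corresponds to the usual blocking byte-range lock. A user-space sketch (f is an assumed *os.File opened for writing; error handling elided):

    // import "golang.org/x/sys/unix"
    lk := unix.Flock_t{
            Type:   unix.F_WRLCK,
            Whence: unix.SEEK_SET,
            Start:  0,
            Len:    100, // lock the first 100 bytes
    }
    unix.FcntlFlock(f.Fd(), unix.F_SETLKW, &lk) // blocks until acquired

    lk.Type = unix.F_UNLCK
    unix.FcntlFlock(f.Fd(), unix.F_SETLK, &lk) // release the same range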
+
+// Fadvise64 implements fadvise64(2).
+// This implementation currently ignores the provided advice.
+func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ length := args[2].Int64()
+ advice := args[3].Int()
+
+ // Note: offset is allowed to be negative.
+ if length < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef(t)
+
+ // If the FD refers to a pipe or FIFO, return ESPIPE.
+ if _, isPipe := file.Impl().(*pipe.VFSPipeFD); isPipe {
+ return 0, nil, syserror.ESPIPE
+ }
+
+ switch advice {
+ case linux.POSIX_FADV_NORMAL:
+ case linux.POSIX_FADV_RANDOM:
+ case linux.POSIX_FADV_SEQUENTIAL:
+ case linux.POSIX_FADV_WILLNEED:
+ case linux.POSIX_FADV_DONTNEED:
+ case linux.POSIX_FADV_NOREUSE:
+ default:
+ return 0, nil, syserror.EINVAL
+ }
+
+ // The advice is valid; ignoring it is permitted.
+ return 0, nil, nil
+}
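Since every recognized advice value succeeds, callers may issue posix_fadvise unconditionally; only pipes are rejected. For instance (sketch; fd and pipeFD are assumed open descriptors):

    // import "golang.org/x/sys/unix"
    unix.Fadvise(fd, 0, 0, unix.FADV_SEQUENTIAL) // accepted, currently a no-op

    err := unix.Fadvise(pipeFD, 0, 0, unix.FADV_DONTNEED)
    _ = err // ESPIPE: advice on pipes is rejected.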
diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
index 46d3e189c..01e0f9010 100644
--- a/pkg/sentry/syscalls/linux/vfs2/filesystem.go
+++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
@@ -56,7 +56,7 @@ func linkat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd i
if err != nil {
return err
}
- defer oldtpop.Release()
+ defer oldtpop.Release(t)
newpath, err := copyInPath(t, newpathAddr)
if err != nil {
@@ -66,7 +66,7 @@ func linkat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd i
if err != nil {
return err
}
- defer newtpop.Release()
+ defer newtpop.Release(t)
return t.Kernel().VFS().LinkAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop)
}
@@ -95,7 +95,7 @@ func mkdirat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint) error {
if err != nil {
return err
}
- defer tpop.Release()
+ defer tpop.Release(t)
return t.Kernel().VFS().MkdirAt(t, t.Credentials(), &tpop.pop, &vfs.MkdirOptions{
Mode: linux.FileMode(mode & (0777 | linux.S_ISVTX) &^ t.FSContext().Umask()),
})
@@ -106,7 +106,7 @@ func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
addr := args[0].Pointer()
mode := args[1].ModeT()
dev := args[2].Uint()
- return 0, nil, mknodat(t, linux.AT_FDCWD, addr, mode, dev)
+ return 0, nil, mknodat(t, linux.AT_FDCWD, addr, linux.FileMode(mode), dev)
}
// Mknodat implements Linux syscall mknodat(2).
@@ -115,10 +115,10 @@ func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
addr := args[1].Pointer()
mode := args[2].ModeT()
dev := args[3].Uint()
- return 0, nil, mknodat(t, dirfd, addr, mode, dev)
+ return 0, nil, mknodat(t, dirfd, addr, linux.FileMode(mode), dev)
}
-func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint, dev uint32) error {
+func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode linux.FileMode, dev uint32) error {
path, err := copyInPath(t, addr)
if err != nil {
return err
@@ -127,10 +127,15 @@ func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint, dev uint
if err != nil {
return err
}
- defer tpop.Release()
+ defer tpop.Release(t)
+
+ // "Zero file type is equivalent to type S_IFREG." - mknod(2)
+ if mode.FileType() == 0 {
+ mode |= linux.ModeRegular
+ }
major, minor := linux.DecodeDeviceID(dev)
return t.Kernel().VFS().MknodAt(t, t.Credentials(), &tpop.pop, &vfs.MknodOptions{
- Mode: linux.FileMode(mode &^ t.FSContext().Umask()),
+ Mode: mode &^ linux.FileMode(t.FSContext().Umask()),
DevMajor: uint32(major),
DevMinor: minor,
})
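With the zero-file-type default added above, the first two calls below are equivalent, per mknod(2) (sketch; paths are placeholders):

    // import "golang.org/x/sys/unix"
    unix.Mknod("plain", 0644, 0)               // S_IFREG implied
    unix.Mknod("plain2", unix.S_IFREG|0644, 0) // spelled out
    unix.Mknod("fifo", unix.S_IFIFO|0644, 0)   // other types remain explicit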
@@ -169,7 +174,7 @@ func openat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, flags uint32, mo
if err != nil {
return 0, nil, err
}
- defer tpop.Release()
+ defer tpop.Release(t)
file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &tpop.pop, &vfs.OpenOptions{
Flags: flags | linux.O_LARGEFILE,
@@ -178,7 +183,7 @@ func openat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, flags uint32, mo
if err != nil {
return 0, nil, err
}
- defer file.DecRef()
+ defer file.DecRef(t)
fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
CloseOnExec: flags&linux.O_CLOEXEC != 0,
@@ -222,7 +227,7 @@ func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd
if err != nil {
return err
}
- defer oldtpop.Release()
+ defer oldtpop.Release(t)
newpath, err := copyInPath(t, newpathAddr)
if err != nil {
@@ -232,7 +237,7 @@ func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd
if err != nil {
return err
}
- defer newtpop.Release()
+ defer newtpop.Release(t)
return t.Kernel().VFS().RenameAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop, &vfs.RenameOptions{
Flags: flags,
@@ -254,7 +259,7 @@ func rmdirat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error {
if err != nil {
return err
}
- defer tpop.Release()
+ defer tpop.Release(t)
return t.Kernel().VFS().RmdirAt(t, t.Credentials(), &tpop.pop)
}
@@ -273,7 +278,7 @@ func unlinkat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error {
if err != nil {
return err
}
- defer tpop.Release()
+ defer tpop.Release(t)
return t.Kernel().VFS().UnlinkAt(t, t.Credentials(), &tpop.pop)
}
@@ -313,6 +318,9 @@ func symlinkat(t *kernel.Task, targetAddr usermem.Addr, newdirfd int32, linkpath
if err != nil {
return err
}
+ if len(target) == 0 {
+ return syserror.ENOENT
+ }
linkpath, err := copyInPath(t, linkpathAddr)
if err != nil {
return err
@@ -321,6 +329,6 @@ func symlinkat(t *kernel.Task, targetAddr usermem.Addr, newdirfd int32, linkpath
if err != nil {
return err
}
- defer tpop.Release()
+ defer tpop.Release(t)
return t.Kernel().VFS().SymlinkAt(t, t.Credentials(), &tpop.pop, target)
}
diff --git a/pkg/sentry/syscalls/linux/vfs2/fscontext.go b/pkg/sentry/syscalls/linux/vfs2/fscontext.go
index 317409a18..a7d4d2a36 100644
--- a/pkg/sentry/syscalls/linux/vfs2/fscontext.go
+++ b/pkg/sentry/syscalls/linux/vfs2/fscontext.go
@@ -31,8 +31,8 @@ func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
root := t.FSContext().RootDirectoryVFS2()
wd := t.FSContext().WorkingDirectoryVFS2()
s, err := t.Kernel().VFS().PathnameForGetcwd(t, root, wd)
- root.DecRef()
- wd.DecRef()
+ root.DecRef(t)
+ wd.DecRef(t)
if err != nil {
return 0, nil, err
}
@@ -67,7 +67,7 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if err != nil {
return 0, nil, err
}
- defer tpop.Release()
+ defer tpop.Release(t)
vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
CheckSearchable: true,
@@ -75,8 +75,8 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if err != nil {
return 0, nil, err
}
- t.FSContext().SetWorkingDirectoryVFS2(vd)
- vd.DecRef()
+ t.FSContext().SetWorkingDirectoryVFS2(t, vd)
+ vd.DecRef(t)
return 0, nil, nil
}
@@ -88,7 +88,7 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if err != nil {
return 0, nil, err
}
- defer tpop.Release()
+ defer tpop.Release(t)
vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
CheckSearchable: true,
@@ -96,8 +96,8 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if err != nil {
return 0, nil, err
}
- t.FSContext().SetWorkingDirectoryVFS2(vd)
- vd.DecRef()
+ t.FSContext().SetWorkingDirectoryVFS2(t, vd)
+ vd.DecRef(t)
return 0, nil, nil
}
@@ -117,7 +117,7 @@ func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if err != nil {
return 0, nil, err
}
- defer tpop.Release()
+ defer tpop.Release(t)
vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
CheckSearchable: true,
@@ -125,7 +125,7 @@ func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if err != nil {
return 0, nil, err
}
- t.FSContext().SetRootDirectoryVFS2(vd)
- vd.DecRef()
+ t.FSContext().SetRootDirectoryVFS2(t, vd)
+ vd.DecRef(t)
return 0, nil, nil
}
diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go
index c7c7bf7ce..5517595b5 100644
--- a/pkg/sentry/syscalls/linux/vfs2/getdents.go
+++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go
@@ -44,7 +44,7 @@ func getdents(t *kernel.Task, args arch.SyscallArguments, isGetdents64 bool) (ui
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
cb := getGetdentsCallback(t, addr, size, isGetdents64)
err := file.IterDirents(t, cb)
diff --git a/pkg/sentry/syscalls/linux/vfs2/inotify.go b/pkg/sentry/syscalls/linux/vfs2/inotify.go
new file mode 100644
index 000000000..11753d8e5
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/inotify.go
@@ -0,0 +1,137 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+const allFlags = linux.IN_NONBLOCK | linux.IN_CLOEXEC
+
+// InotifyInit1 implements the inotify_init1() syscall.
+func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ flags := args[0].Int()
+ if flags&^allFlags != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ ino, err := vfs.NewInotifyFD(t, t.Kernel().VFS(), uint32(flags))
+ if err != nil {
+ return 0, nil, err
+ }
+ defer ino.DecRef(t)
+
+ fd, err := t.NewFDFromVFS2(0, ino, kernel.FDFlags{
+ CloseOnExec: flags&linux.IN_CLOEXEC != 0,
+ })
+
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return uintptr(fd), nil, nil
+}
+
+// InotifyInit implements the inotify_init() syscall.
+func InotifyInit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ args[0].Value = 0
+ return InotifyInit1(t, args)
+}
+
+// fdToInotify resolves an fd to an inotify object. If successful, the file will
+// have an extra ref and the caller is responsible for releasing the ref.
+func fdToInotify(t *kernel.Task, fd int32) (*vfs.Inotify, *vfs.FileDescription, error) {
+ f := t.GetFileVFS2(fd)
+ if f == nil {
+ // Invalid fd.
+ return nil, nil, syserror.EBADF
+ }
+
+ ino, ok := f.Impl().(*vfs.Inotify)
+ if !ok {
+ // Not an inotify fd.
+ f.DecRef(t)
+ return nil, nil, syserror.EINVAL
+ }
+
+ return ino, f, nil
+}
+
+// InotifyAddWatch implements the inotify_add_watch() syscall.
+func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ mask := args[2].Uint()
+
+ // "EINVAL: The given event mask contains no valid events."
+ // -- inotify_add_watch(2)
+ if mask&linux.ALL_INOTIFY_BITS == 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // "IN_DONT_FOLLOW: Don't dereference pathname if it is a symbolic link."
+ // -- inotify(7)
+ follow := followFinalSymlink
+ if mask&linux.IN_DONT_FOLLOW == 0 {
+ follow = nofollowFinalSymlink
+ }
+
+ ino, f, err := fdToInotify(t, fd)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer f.DecRef(t)
+
+ path, err := copyInPath(t, addr)
+ if err != nil {
+ return 0, nil, err
+ }
+ if mask&linux.IN_ONLYDIR != 0 {
+ path.Dir = true
+ }
+ tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, follow)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer tpop.Release(t)
+ d, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{})
+ if err != nil {
+ return 0, nil, err
+ }
+ defer d.DecRef(t)
+
+ fd, err = ino.AddWatch(d.Dentry(), mask)
+ if err != nil {
+ return 0, nil, err
+ }
+ return uintptr(fd), nil, nil
+}
+
+// InotifyRmWatch implements the inotify_rm_watch() syscall.
+func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ wd := args[1].Int()
+
+ ino, f, err := fdToInotify(t, fd)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer f.DecRef(t)
+ return 0, nil, ino.RmWatch(t, wd)
+}
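End to end, the three syscalls compose as in the sketch below (path is a placeholder; error handling elided). IN_ONLYDIR travels in the mask and is translated into path.Dir above, while IN_CLOEXEC is consumed by inotify_init1 itself:

    // import "golang.org/x/sys/unix"
    fd, _ := unix.InotifyInit1(unix.IN_CLOEXEC)
    defer unix.Close(fd)

    wd, _ := unix.InotifyAddWatch(fd, "/tmp/watched",
            unix.IN_CREATE|unix.IN_DELETE|unix.IN_ONLYDIR)

    buf := make([]byte, 4096)
    n, _ := unix.Read(fd, buf)
    _ = n // parse packed unix.InotifyEvent records out of buf[:n]

    unix.InotifyRmWatch(fd, uint32(wd))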
diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
index 5a2418da9..38778a388 100644
--- a/pkg/sentry/syscalls/linux/vfs2/ioctl.go
+++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
@@ -15,6 +15,7 @@
package vfs2
import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/syserror"
@@ -28,7 +29,78 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
+
+ // Handle ioctls that apply to all FDs.
+ switch args[1].Int() {
+ case linux.FIONCLEX:
+ t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{
+ CloseOnExec: false,
+ })
+ return 0, nil, nil
+
+ case linux.FIOCLEX:
+ t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{
+ CloseOnExec: true,
+ })
+ return 0, nil, nil
+
+ case linux.FIONBIO:
+ var set int32
+ if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+ return 0, nil, err
+ }
+ flags := file.StatusFlags()
+ if set != 0 {
+ flags |= linux.O_NONBLOCK
+ } else {
+ flags &^= linux.O_NONBLOCK
+ }
+ return 0, nil, file.SetStatusFlags(t, t.Credentials(), flags)
+
+ case linux.FIOASYNC:
+ var set int32
+ if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+ return 0, nil, err
+ }
+ flags := file.StatusFlags()
+ if set != 0 {
+ flags |= linux.O_ASYNC
+ } else {
+ flags &^= linux.O_ASYNC
+ }
+ file.SetStatusFlags(t, t.Credentials(), flags)
+ return 0, nil, nil
+
+ case linux.FIOGETOWN, linux.SIOCGPGRP:
+ var who int32
+ owner, hasOwner := getAsyncOwner(t, file)
+ if hasOwner {
+ if owner.Type == linux.F_OWNER_PGRP {
+ who = -owner.PID
+ } else {
+ who = owner.PID
+ }
+ }
+ _, err := t.CopyOut(args[2].Pointer(), &who)
+ return 0, nil, err
+
+ case linux.FIOSETOWN, linux.SIOCSPGRP:
+ var who int32
+ if _, err := t.CopyIn(args[2].Pointer(), &who); err != nil {
+ return 0, nil, err
+ }
+ ownerType := int32(linux.F_OWNER_PID)
+ if who < 0 {
+ // Check for overflow before flipping the sign.
+ if who-1 > who {
+ return 0, nil, syserror.EINVAL
+ }
+ ownerType = linux.F_OWNER_PGRP
+ who = -who
+ }
+ return 0, nil, setAsyncOwner(t, file, ownerType, who)
+ }
ret, err := file.Ioctl(t, t.MemoryManager(), args)
return ret, nil, err
diff --git a/pkg/sentry/syscalls/linux/vfs2/lock.go b/pkg/sentry/syscalls/linux/vfs2/lock.go
new file mode 100644
index 000000000..b910b5a74
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/lock.go
@@ -0,0 +1,64 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Flock implements Linux syscall flock(2).
+func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ operation := args[1].Int()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ // flock(2): EBADF fd is not an open file descriptor.
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef(t)
+
+ nonblocking := operation&linux.LOCK_NB != 0
+ operation &^= linux.LOCK_NB
+
+ var blocker lock.Blocker
+ if !nonblocking {
+ blocker = t
+ }
+
+ switch operation {
+ case linux.LOCK_EX:
+ if err := file.LockBSD(t, lock.WriteLock, blocker); err != nil {
+ return 0, nil, err
+ }
+ case linux.LOCK_SH:
+ if err := file.LockBSD(t, lock.ReadLock, blocker); err != nil {
+ return 0, nil, err
+ }
+ case linux.LOCK_UN:
+ if err := file.UnlockBSD(t); err != nil {
+ return 0, nil, err
+ }
+ default:
+ // flock(2): EINVAL operation is invalid.
+ return 0, nil, syserror.EINVAL
+ }
+
+ return 0, nil, nil
+}
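User-space usage of the three operations, with LOCK_NB mapping to a nil blocker as above (sketch; error handling mostly elided):

    // import ("os"; "golang.org/x/sys/unix")
    f, _ := os.Open("data.lock")
    fd := int(f.Fd())

    if err := unix.Flock(fd, unix.LOCK_EX|unix.LOCK_NB); err == unix.EWOULDBLOCK {
            // Lock is held elsewhere; with LOCK_NB we fail instead of blocking.
    }

    unix.Flock(fd, unix.LOCK_SH) // downgrade to shared
    unix.Flock(fd, unix.LOCK_UN) // release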
diff --git a/pkg/sentry/syscalls/linux/vfs2/memfd.go b/pkg/sentry/syscalls/linux/vfs2/memfd.go
index bbe248d17..c4c0f9e0a 100644
--- a/pkg/sentry/syscalls/linux/vfs2/memfd.go
+++ b/pkg/sentry/syscalls/linux/vfs2/memfd.go
@@ -47,10 +47,11 @@ func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
}
shmMount := t.Kernel().ShmMount()
- file, err := tmpfs.NewMemfd(shmMount, t.Credentials(), allowSeals, memfdPrefix+name)
+ file, err := tmpfs.NewMemfd(t, t.Credentials(), shmMount, allowSeals, memfdPrefix+name)
if err != nil {
return 0, nil, err
}
+ defer file.DecRef(t)
fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
CloseOnExec: cloExec,
diff --git a/pkg/sentry/syscalls/linux/vfs2/mmap.go b/pkg/sentry/syscalls/linux/vfs2/mmap.go
index 60a43f0a0..dc05c2994 100644
--- a/pkg/sentry/syscalls/linux/vfs2/mmap.go
+++ b/pkg/sentry/syscalls/linux/vfs2/mmap.go
@@ -61,7 +61,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
}
defer func() {
if opts.MappingIdentity != nil {
- opts.MappingIdentity.DecRef()
+ opts.MappingIdentity.DecRef(t)
}
}()
@@ -71,7 +71,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// mmap unconditionally requires that the FD is readable.
if !file.IsReadable() {
diff --git a/pkg/sentry/syscalls/linux/vfs2/mount.go b/pkg/sentry/syscalls/linux/vfs2/mount.go
new file mode 100644
index 000000000..4bd5c7ca2
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/mount.go
@@ -0,0 +1,150 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Mount implements Linux syscall mount(2).
+func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ sourceAddr := args[0].Pointer()
+ targetAddr := args[1].Pointer()
+ typeAddr := args[2].Pointer()
+ flags := args[3].Uint64()
+ dataAddr := args[4].Pointer()
+
+ // For null-terminated strings related to mount(2), Linux copies in at most
+ // a page worth of data. See fs/namespace.c:copy_mount_string().
+ fsType, err := t.CopyInString(typeAddr, usermem.PageSize)
+ if err != nil {
+ return 0, nil, err
+ }
+ source, err := t.CopyInString(sourceAddr, usermem.PageSize)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ targetPath, err := copyInPath(t, targetAddr)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ data := ""
+ if dataAddr != 0 {
+ // In Linux, a full page is always copied in regardless of null
+ // character placement, and the address is passed to each file system.
+ // Most file systems always treat this data as a string, though, and so
+ // do all of the ones we implement.
+ data, err = t.CopyInString(dataAddr, usermem.PageSize)
+ if err != nil {
+ return 0, nil, err
+ }
+ }
+
+ // Ignore magic value that was required before Linux 2.4.
+ if flags&linux.MS_MGC_MSK == linux.MS_MGC_VAL {
+ flags = flags &^ linux.MS_MGC_MSK
+ }
+
+ // Must have CAP_SYS_ADMIN in the current mount namespace's associated user
+ // namespace.
+ creds := t.Credentials()
+ if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) {
+ return 0, nil, syserror.EPERM
+ }
+
+ const unsupportedOps = linux.MS_REMOUNT | linux.MS_BIND |
+ linux.MS_SHARED | linux.MS_PRIVATE | linux.MS_SLAVE |
+ linux.MS_UNBINDABLE | linux.MS_MOVE
+
+ // Silently allow MS_NOSUID, since we don't implement set-id bits
+ // anyway.
+ const unsupportedFlags = linux.MS_NODIRATIME | linux.MS_STRICTATIME
+
+ // Linux just allows passing any flags to mount(2) - it won't fail when
+ // unknown or unsupported flags are passed. Since we don't implement
+ // everything, we fail explicitly on flags that are unimplemented.
+ if flags&(unsupportedOps|unsupportedFlags) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ var opts vfs.MountOptions
+ if flags&linux.MS_NOATIME == linux.MS_NOATIME {
+ opts.Flags.NoATime = true
+ }
+ if flags&linux.MS_NOEXEC == linux.MS_NOEXEC {
+ opts.Flags.NoExec = true
+ }
+ if flags&linux.MS_NODEV == linux.MS_NODEV {
+ opts.Flags.NoDev = true
+ }
+ if flags&linux.MS_NOSUID == linux.MS_NOSUID {
+ opts.Flags.NoSUID = true
+ }
+ if flags&linux.MS_RDONLY == linux.MS_RDONLY {
+ opts.ReadOnly = true
+ }
+ opts.GetFilesystemOptions.Data = data
+
+ target, err := getTaskPathOperation(t, linux.AT_FDCWD, targetPath, disallowEmptyPath, nofollowFinalSymlink)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer target.Release(t)
+
+ return 0, nil, t.Kernel().VFS().MountAt(t, creds, source, &target.pop, fsType, &opts)
+}
+
+// Umount2 implements Linux syscall umount2(2).
+func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ flags := args[1].Int()
+
+ // Must have CAP_SYS_ADMIN in the mount namespace's associated user
+ // namespace.
+ //
+ // Currently, this is always the init task's user namespace.
+ creds := t.Credentials()
+ if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) {
+ return 0, nil, syserror.EPERM
+ }
+
+ const unsupported = linux.MNT_FORCE | linux.MNT_EXPIRE
+ if flags&unsupported != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ path, err := copyInPath(t, addr)
+ if err != nil {
+ return 0, nil, err
+ }
+ tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, nofollowFinalSymlink)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer tpop.Release(t)
+
+ opts := vfs.UmountOptions{
+ Flags: uint32(flags),
+ }
+
+ return 0, nil, t.Kernel().VFS().UmountAt(t, creds, &tpop.pop, &opts)
+}
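A mount/unmount pair that passes the flag checks above might look like this from user space (sketch; the target and data string are placeholders, and both calls need CAP_SYS_ADMIN in the mount namespace's user namespace):

    // import "golang.org/x/sys/unix"
    unix.Mount("none", "/mnt/scratch", "tmpfs",
            unix.MS_NOATIME|unix.MS_NODEV, "mode=0755")

    // MNT_FORCE and MNT_EXPIRE would be rejected; MNT_DETACH is accepted.
    unix.Unmount("/mnt/scratch", unix.MNT_DETACH)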
diff --git a/pkg/sentry/syscalls/linux/vfs2/path.go b/pkg/sentry/syscalls/linux/vfs2/path.go
index 97da6c647..90a511d9a 100644
--- a/pkg/sentry/syscalls/linux/vfs2/path.go
+++ b/pkg/sentry/syscalls/linux/vfs2/path.go
@@ -42,7 +42,7 @@ func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldA
haveStartRef := false
if !path.Absolute {
if !path.HasComponents() && !bool(shouldAllowEmptyPath) {
- root.DecRef()
+ root.DecRef(t)
return taskPathOperation{}, syserror.ENOENT
}
if dirfd == linux.AT_FDCWD {
@@ -51,13 +51,13 @@ func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldA
} else {
dirfile := t.GetFileVFS2(dirfd)
if dirfile == nil {
- root.DecRef()
+ root.DecRef(t)
return taskPathOperation{}, syserror.EBADF
}
start = dirfile.VirtualDentry()
start.IncRef()
haveStartRef = true
- dirfile.DecRef()
+ dirfile.DecRef(t)
}
}
return taskPathOperation{
@@ -71,10 +71,10 @@ func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldA
}, nil
}
-func (tpop *taskPathOperation) Release() {
- tpop.pop.Root.DecRef()
+func (tpop *taskPathOperation) Release(t *kernel.Task) {
+ tpop.pop.Root.DecRef(t)
if tpop.haveStartRef {
- tpop.pop.Start.DecRef()
+ tpop.pop.Start.DecRef(t)
tpop.haveStartRef = false
}
}
diff --git a/pkg/sentry/syscalls/linux/vfs2/pipe.go b/pkg/sentry/syscalls/linux/vfs2/pipe.go
index 4a01e4209..9b4848d9e 100644
--- a/pkg/sentry/syscalls/linux/vfs2/pipe.go
+++ b/pkg/sentry/syscalls/linux/vfs2/pipe.go
@@ -42,8 +42,8 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags int32) error {
return syserror.EINVAL
}
r, w := pipefs.NewConnectedPipeFDs(t, t.Kernel().PipeMount(), uint32(flags&linux.O_NONBLOCK))
- defer r.DecRef()
- defer w.DecRef()
+ defer r.DecRef(t)
+ defer w.DecRef(t)
fds, err := t.NewFDsVFS2(0, []*vfs.FileDescription{r, w}, kernel.FDFlags{
CloseOnExec: flags&linux.O_CLOEXEC != 0,
@@ -54,7 +54,7 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags int32) error {
if _, err := t.CopyOut(addr, fds); err != nil {
for _, fd := range fds {
if _, file := t.FDTable().Remove(fd); file != nil {
- file.DecRef()
+ file.DecRef(t)
}
}
return err
diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go
index ff1b25d7b..7b9d5e18a 100644
--- a/pkg/sentry/syscalls/linux/vfs2/poll.go
+++ b/pkg/sentry/syscalls/linux/vfs2/poll.go
@@ -73,7 +73,7 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan
}
if ch == nil {
- defer file.DecRef()
+ defer file.DecRef(t)
} else {
state.file = file
state.waiter, _ = waiter.NewChannelEntry(ch)
@@ -85,11 +85,11 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan
}
// releaseState releases all the pollState in "state".
-func releaseState(state []pollState) {
+func releaseState(t *kernel.Task, state []pollState) {
for i := range state {
if state[i].file != nil {
state[i].file.EventUnregister(&state[i].waiter)
- state[i].file.DecRef()
+ state[i].file.DecRef(t)
}
}
}
@@ -110,7 +110,7 @@ func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time.
// result, we stop registering for events but still go through all files
// to get their ready masks.
state := make([]pollState, len(pfd))
- defer releaseState(state)
+ defer releaseState(t, state)
n := uintptr(0)
for i := range pfd {
initReadiness(t, &pfd[i], &state[i], ch)
@@ -269,7 +269,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add
if file == nil {
return 0, syserror.EBADF
}
- file.DecRef()
+ file.DecRef(t)
var mask int16
if (rV & m) != 0 {
diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go
index 3a7ef24f5..a905dae0a 100644
--- a/pkg/sentry/syscalls/linux/vfs2/read_write.go
+++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go
@@ -44,7 +44,7 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the size is legitimate.
si := int(size)
@@ -75,7 +75,7 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Get the destination of the read.
dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
@@ -93,11 +93,17 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
n, err := file.Read(t, dst, opts)
if err != syserror.ErrWouldBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent)
+ }
return n, err
}
allowBlock, deadline, hasDeadline := blockPolicy(t, file)
if !allowBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent)
+ }
return n, err
}
@@ -128,6 +134,9 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt
}
file.EventUnregister(&w)
+ if total > 0 {
+ file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent)
+ }
return total, err
}
@@ -142,7 +151,7 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the offset is legitimate and does not overflow.
if offset < 0 || offset+int64(size) < 0 {
@@ -179,7 +188,7 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the offset is legitimate.
if offset < 0 {
@@ -217,7 +226,7 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the offset is legitimate.
if offset < -1 {
@@ -248,11 +257,17 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
n, err := file.PRead(t, dst, offset, opts)
if err != syserror.ErrWouldBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent)
+ }
return n, err
}
allowBlock, deadline, hasDeadline := blockPolicy(t, file)
if !allowBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent)
+ }
return n, err
}
@@ -283,6 +298,9 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of
}
file.EventUnregister(&w)
+ if total > 0 {
+ file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent)
+ }
return total, err
}
@@ -296,7 +314,7 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the size is legitimate.
si := int(size)
@@ -327,7 +345,7 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Get the source of the write.
src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
@@ -345,11 +363,17 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
n, err := file.Write(t, src, opts)
if err != syserror.ErrWouldBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
+ }
return n, err
}
allowBlock, deadline, hasDeadline := blockPolicy(t, file)
if !allowBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
+ }
return n, err
}
@@ -380,6 +404,9 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op
}
file.EventUnregister(&w)
+ if total > 0 {
+ file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
+ }
return total, err
}
@@ -394,7 +421,7 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the offset is legitimate and does not overflow.
if offset < 0 || offset+int64(size) < 0 {
@@ -431,7 +458,7 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the offset is legitimate.
if offset < 0 {
@@ -469,7 +496,7 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Check that the offset is legitimate.
if offset < -1 {
@@ -500,11 +527,17 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
n, err := file.PWrite(t, src, offset, opts)
if err != syserror.ErrWouldBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
+ }
return n, err
}
allowBlock, deadline, hasDeadline := blockPolicy(t, file)
if !allowBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
+ }
return n, err
}
@@ -535,6 +568,9 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o
}
file.EventUnregister(&w)
+ if total > 0 {
+ file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
+ }
return total, err
}
@@ -565,8 +601,41 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
newoff, err := file.Seek(t, offset, whence)
return uintptr(newoff), nil, err
}
+
+// Readahead implements readahead(2).
+func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ offset := args[1].Int64()
+ size := args[2].SizeT()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef(t)
+
+ // Check that the file is readable.
+ if !file.IsReadable() {
+ return 0, nil, syserror.EBADF
+ }
+
+ // Check that the size is valid.
+ if int(size) < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Check that the offset is legitimate and does not overflow.
+ if offset < 0 || offset+int64(size) < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Return EINVAL: this matches Linux, which returns EINVAL when the
+ // underlying file type does not support readahead. In the future, we may
+ // extend this function to actually support readahead hints.
+ return 0, nil, syserror.EINVAL
+}
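For illustration, a sandboxed program would observe the stub like this (a sketch: the path is hypothetical and golang.org/x/sys/unix is assumed):

    package main

    import (
    	"fmt"
    	"os"

    	"golang.org/x/sys/unix"
    )

    func main() {
    	f, err := os.Open("/etc/hostname") // hypothetical file
    	if err != nil {
    		panic(err)
    	}
    	defer f.Close()
    	// After validating the fd, readability, size, and offset, the stub
    	// returns EINVAL, which is also what Linux returns for file types
    	// without readahead support.
    	if err := unix.Readahead(int(f.Fd()), 0, 1<<20); err == unix.EINVAL {
    		fmt.Println("readahead hint not supported")
    	}
    }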
diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go
index 09ecfed26..5e6eb13ba 100644
--- a/pkg/sentry/syscalls/linux/vfs2/setstat.go
+++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go
@@ -20,6 +20,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -65,7 +66,7 @@ func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
return 0, nil, file.SetStat(t, vfs.SetStatOptions{
Stat: linux.Statx{
@@ -150,7 +151,7 @@ func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
var opts vfs.SetStatOptions
if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil {
@@ -178,6 +179,7 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
Mask: linux.STATX_SIZE,
Size: uint64(length),
},
+ NeedWritePerm: true,
})
return 0, nil, handleSetSizeError(t, err)
}
@@ -195,7 +197,11 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
+
+ if !file.IsWritable() {
+ return 0, nil, syserror.EINVAL
+ }
err := file.SetStat(t, vfs.SetStatOptions{
Stat: linux.Statx{
@@ -206,6 +212,56 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, handleSetSizeError(t, err)
}
+// Fallocate implements Linux syscall fallocate(2).
+func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ mode := args[1].Uint64()
+ offset := args[2].Int64()
+ length := args[3].Int64()
+
+ file := t.GetFileVFS2(fd)
+
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef(t)
+
+ if !file.IsWritable() {
+ return 0, nil, syserror.EBADF
+ }
+
+ if mode != 0 {
+ return 0, nil, syserror.ENOTSUP
+ }
+
+ if offset < 0 || length <= 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ size := offset + length
+
+ if size < 0 {
+ return 0, nil, syserror.EFBIG
+ }
+
+ limit := limits.FromContext(t).Get(limits.FileSize).Cur
+
+ if uint64(size) >= limit {
+ t.SendSignal(&arch.SignalInfo{
+ Signo: int32(linux.SIGXFSZ),
+ Code: arch.SignalInfoUser,
+ })
+ return 0, nil, syserror.EFBIG
+ }
+
+ if err := file.Allocate(t, mode, uint64(offset), uint64(length)); err != nil {
+ return 0, nil, err
+ }
+
+ file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
+ return 0, nil, nil
+}
+
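A short usage sketch from inside the sandbox (assumes golang.org/x/sys/unix; the helper name is invented). Note that mode must be zero, since any other mode returns ENOTSUP here:

    import (
    	"fmt"

    	"golang.org/x/sys/unix"
    )

    // preallocate reserves length bytes at offset; fd must be open for
    // writing. On success the implementation above also generates an
    // IN_MODIFY inotify event.
    func preallocate(fd int, offset, length int64) error {
    	if err := unix.Fallocate(fd, 0, offset, length); err != nil {
    		return fmt.Errorf("fallocate: %w", err) // EBADF, EINVAL, EFBIG, ...
    	}
    	return nil
    }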
// Utime implements Linux syscall utime(2).
func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
pathAddr := args[0].Pointer()
@@ -382,7 +438,7 @@ func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, op
func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error {
root := t.FSContext().RootDirectoryVFS2()
- defer root.DecRef()
+ defer root.DecRef(t)
start := root
if !path.Absolute {
if !path.HasComponents() && !bool(shouldAllowEmptyPath) {
@@ -390,7 +446,7 @@ func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPa
}
if dirfd == linux.AT_FDCWD {
start = t.FSContext().WorkingDirectoryVFS2()
- defer start.DecRef()
+ defer start.DecRef(t)
} else {
dirfile := t.GetFileVFS2(dirfd)
if dirfile == nil {
@@ -401,13 +457,13 @@ func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPa
// VirtualFilesystem.SetStatAt(), since the former may be able
// to use opened file state to expedite the SetStat.
err := dirfile.SetStat(t, *opts)
- dirfile.DecRef()
+ dirfile.DecRef(t)
return err
}
start = dirfile.VirtualDentry()
start.IncRef()
- defer start.DecRef()
- dirfile.DecRef()
+ defer start.DecRef(t)
+ dirfile.DecRef(t)
}
}
return t.Kernel().VFS().SetStatAt(t, t.Credentials(), &vfs.PathOperation{
diff --git a/pkg/sentry/syscalls/linux/vfs2/signal.go b/pkg/sentry/syscalls/linux/vfs2/signal.go
index 623992f6f..b89f34cdb 100644
--- a/pkg/sentry/syscalls/linux/vfs2/signal.go
+++ b/pkg/sentry/syscalls/linux/vfs2/signal.go
@@ -45,7 +45,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize ui
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Is this a signalfd?
if sfd, ok := file.Impl().(*signalfd.SignalFileDescription); ok {
@@ -68,7 +68,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize ui
if err != nil {
return 0, nil, err
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Create a new descriptor.
fd, err = t.NewFDFromVFS2(0, file, kernel.FDFlags{
diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go
index 10b668477..4a68c64f3 100644
--- a/pkg/sentry/syscalls/linux/vfs2/socket.go
+++ b/pkg/sentry/syscalls/linux/vfs2/socket.go
@@ -30,6 +30,8 @@ import (
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/tools/go_marshal/marshal"
+ "gvisor.dev/gvisor/tools/go_marshal/primitive"
)
// minListenBacklog is the minimum reasonable backlog for listening sockets.
@@ -194,7 +196,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if e != nil {
return 0, nil, e.ToError()
}
- defer s.DecRef()
+ defer s.DecRef(t)
if err := s.SetStatusFlags(t, t.Credentials(), uint32(stype&linux.SOCK_NONBLOCK)); err != nil {
return 0, nil, err
@@ -228,8 +230,8 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
return 0, nil, e.ToError()
}
// Adding to the FD table will cause an extra reference to be acquired.
- defer s1.DecRef()
- defer s2.DecRef()
+ defer s1.DecRef(t)
+ defer s2.DecRef(t)
nonblocking := uint32(stype & linux.SOCK_NONBLOCK)
if err := s1.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil {
@@ -251,7 +253,7 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
if _, err := t.CopyOut(addr, fds); err != nil {
for _, fd := range fds {
if _, file := t.FDTable().Remove(fd); file != nil {
- file.DecRef()
+ file.DecRef(t)
}
}
return 0, nil, err
@@ -271,7 +273,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -302,7 +304,7 @@ func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, f
if file == nil {
return 0, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -361,7 +363,7 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -388,7 +390,7 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -417,7 +419,7 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -448,7 +450,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -477,7 +479,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
}
if v != nil {
- if _, err := t.CopyOut(optValAddr, v); err != nil {
+ if _, err := v.CopyOut(t, optValAddr); err != nil {
return 0, nil, err
}
}
@@ -487,7 +489,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
// getSockOpt tries to handle common socket options, or dispatches to a specific
// socket implementation.
-func getSockOpt(t *kernel.Task, s socket.SocketVFS2, level, name int, optValAddr usermem.Addr, len int) (interface{}, *syserr.Error) {
+func getSockOpt(t *kernel.Task, s socket.SocketVFS2, level, name int, optValAddr usermem.Addr, len int) (marshal.Marshallable, *syserr.Error) {
if level == linux.SOL_SOCKET {
switch name {
case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL:
@@ -499,13 +501,16 @@ func getSockOpt(t *kernel.Task, s socket.SocketVFS2, level, name int, optValAddr
switch name {
case linux.SO_TYPE:
_, skType, _ := s.Type()
- return int32(skType), nil
+ v := primitive.Int32(skType)
+ return &v, nil
case linux.SO_DOMAIN:
family, _, _ := s.Type()
- return int32(family), nil
+ v := primitive.Int32(family)
+ return &v, nil
case linux.SO_PROTOCOL:
_, _, protocol := s.Type()
- return int32(protocol), nil
+ v := primitive.Int32(protocol)
+ return &v, nil
}
}
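Sketched in isolation, the new return convention wraps native integers in go_marshal primitives so the caller can copy them out without reflection, as GetSockOpt now does via v.CopyOut(t, optValAddr) (soType is an invented name):

    func soType(s socket.SocketVFS2) marshal.Marshallable {
    	_, skType, _ := s.Type()
    	v := primitive.Int32(skType) // *primitive.Int32 implements Marshallable
    	return &v
    }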
@@ -527,7 +532,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -542,7 +547,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
return 0, nil, syserror.EINVAL
}
buf := t.CopyScratchBuffer(int(optLen))
- if _, err := t.CopyIn(optValAddr, &buf); err != nil {
+ if _, err := t.CopyInBytes(optValAddr, buf); err != nil {
return 0, nil, err
}
@@ -565,7 +570,7 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -593,7 +598,7 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -626,7 +631,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -679,7 +684,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -773,7 +778,7 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla
}
if !cms.Unix.Empty() {
mflags |= linux.MSG_CTRUNC
- cms.Release()
+ cms.Release(t)
}
if int(msg.Flags) != mflags {
@@ -793,7 +798,7 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla
if e != nil {
return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
}
- defer cms.Release()
+ defer cms.Release(t)
controlData := make([]byte, 0, msg.ControlLen)
controlData = control.PackControlMessages(t, cms, controlData)
@@ -849,7 +854,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag
if file == nil {
return 0, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -878,7 +883,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag
}
n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0)
- cm.Release()
+ cm.Release(t)
if e != nil {
return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
}
@@ -922,7 +927,7 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -960,7 +965,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
@@ -1064,7 +1069,7 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio
n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages)
err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file)
if err != nil {
- controlMessages.Release()
+ controlMessages.Release(t)
}
return uintptr(n), err
}
@@ -1082,7 +1087,7 @@ func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags
if file == nil {
return 0, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
// Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2)
diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go
new file mode 100644
index 000000000..75bfa2c79
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/splice.go
@@ -0,0 +1,490 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "io"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// Splice implements Linux syscall splice(2).
+func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ inFD := args[0].Int()
+ inOffsetPtr := args[1].Pointer()
+ outFD := args[2].Int()
+ outOffsetPtr := args[3].Pointer()
+ count := int64(args[4].SizeT())
+ flags := args[5].Int()
+
+ if count == 0 {
+ return 0, nil, nil
+ }
+ if count > int64(kernel.MAX_RW_COUNT) {
+ count = int64(kernel.MAX_RW_COUNT)
+ }
+
+ // Check for invalid flags.
+ if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get file descriptions.
+ inFile := t.GetFileVFS2(inFD)
+ if inFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer inFile.DecRef(t)
+ outFile := t.GetFileVFS2(outFD)
+ if outFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer outFile.DecRef(t)
+
+ // Check that both files support the required directionality.
+ if !inFile.IsReadable() || !outFile.IsWritable() {
+ return 0, nil, syserror.EBADF
+ }
+
+ // The operation is non-blocking if anything is non-blocking.
+ //
+ // N.B. This is a rather simplistic heuristic that avoids some
+ // poor edge case behavior since the exact semantics here are
+ // underspecified and vary between versions of Linux itself.
+ nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0)
+
+ // At least one file description must represent a pipe.
+ inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD)
+ outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
+ if !inIsPipe && !outIsPipe {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Copy in offsets.
+ inOffset := int64(-1)
+ if inOffsetPtr != 0 {
+ if inIsPipe {
+ return 0, nil, syserror.ESPIPE
+ }
+ if inFile.Options().DenyPRead {
+ return 0, nil, syserror.EINVAL
+ }
+ if _, err := t.CopyIn(inOffsetPtr, &inOffset); err != nil {
+ return 0, nil, err
+ }
+ if inOffset < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ }
+ outOffset := int64(-1)
+ if outOffsetPtr != 0 {
+ if outIsPipe {
+ return 0, nil, syserror.ESPIPE
+ }
+ if outFile.Options().DenyPWrite {
+ return 0, nil, syserror.EINVAL
+ }
+ if _, err := t.CopyIn(outOffsetPtr, &outOffset); err != nil {
+ return 0, nil, err
+ }
+ if outOffset < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ }
+
+ // Move data.
+ var (
+ n int64
+ err error
+ )
+ dw := dualWaiter{
+ inFile: inFile,
+ outFile: outFile,
+ }
+ defer dw.destroy()
+ for {
+ // If both input and output are pipes, delegate to the pipe
+ // implementation. Otherwise, exactly one end is a pipe, which
+ // we ensure is consistently ordered after the non-pipe FD's
+ // locks by passing the pipe FD as usermem.IO to the non-pipe
+ // end.
+ switch {
+ case inIsPipe && outIsPipe:
+ n, err = pipe.Splice(t, outPipeFD, inPipeFD, count)
+ case inIsPipe:
+ if outOffset != -1 {
+ n, err = outFile.PWrite(t, inPipeFD.IOSequence(count), outOffset, vfs.WriteOptions{})
+ outOffset += n
+ } else {
+ n, err = outFile.Write(t, inPipeFD.IOSequence(count), vfs.WriteOptions{})
+ }
+ case outIsPipe:
+ if inOffset != -1 {
+ n, err = inFile.PRead(t, outPipeFD.IOSequence(count), inOffset, vfs.ReadOptions{})
+ inOffset += n
+ } else {
+ n, err = inFile.Read(t, outPipeFD.IOSequence(count), vfs.ReadOptions{})
+ }
+ default:
+ panic("not possible")
+ }
+
+ if n != 0 || err != syserror.ErrWouldBlock || nonBlock {
+ break
+ }
+ if err = dw.waitForBoth(t); err != nil {
+ break
+ }
+ }
+
+ // Copy updated offsets out.
+ if inOffsetPtr != 0 {
+ if _, err := t.CopyOut(inOffsetPtr, &inOffset); err != nil {
+ return 0, nil, err
+ }
+ }
+ if outOffsetPtr != 0 {
+ if _, err := t.CopyOut(outOffsetPtr, &outOffset); err != nil {
+ return 0, nil, err
+ }
+ }
+
+ if n == 0 {
+ return 0, nil, err
+ }
+
+ // On Linux, inotify behavior is not very consistent with splice(2). We try
+ // our best to emulate Linux for very basic calls to splice, where for some
+ // reason, events are generated for output files, but not input files.
+ outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
+ return uintptr(n), nil, nil
+}
+
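From userspace, the typical pattern pairs a pipe with a regular file, since at least one end must be a pipe (a sketch; assumes golang.org/x/sys/unix and an invented helper name):

    import "golang.org/x/sys/unix"

    // spliceFileToPipe moves up to n bytes from a regular file into the
    // write end of a pipe without copying through userspace. Passing a
    // non-nil offset requires the file end to support pread; an offset on
    // the pipe end would be rejected with ESPIPE.
    func spliceFileToPipe(fileFD, pipeWriteFD int, off int64, n int) (int64, error) {
    	return unix.Splice(fileFD, &off, pipeWriteFD, nil, n, 0)
    }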
+// Tee implements Linux syscall tee(2).
+func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ inFD := args[0].Int()
+ outFD := args[1].Int()
+ count := int64(args[2].SizeT())
+ flags := args[3].Int()
+
+ if count == 0 {
+ return 0, nil, nil
+ }
+ if count > int64(kernel.MAX_RW_COUNT) {
+ count = int64(kernel.MAX_RW_COUNT)
+ }
+
+ // Check for invalid flags.
+ if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get file descriptions.
+ inFile := t.GetFileVFS2(inFD)
+ if inFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer inFile.DecRef(t)
+ outFile := t.GetFileVFS2(outFD)
+ if outFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer outFile.DecRef(t)
+
+ // Check that both files support the required directionality.
+ if !inFile.IsReadable() || !outFile.IsWritable() {
+ return 0, nil, syserror.EBADF
+ }
+
+ // The operation is non-blocking if anything is non-blocking.
+ //
+ // N.B. This is a rather simplistic heuristic that avoids some
+ // poor edge case behavior since the exact semantics here are
+ // underspecified and vary between versions of Linux itself.
+ nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0)
+
+ // Both file descriptions must represent pipes.
+ inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD)
+ outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
+ if !inIsPipe || !outIsPipe {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Copy data.
+ var (
+ n int64
+ err error
+ )
+ dw := dualWaiter{
+ inFile: inFile,
+ outFile: outFile,
+ }
+ defer dw.destroy()
+ for {
+ n, err = pipe.Tee(t, outPipeFD, inPipeFD, count)
+ if n != 0 || err != syserror.ErrWouldBlock || nonBlock {
+ break
+ }
+ if err = dw.waitForBoth(t); err != nil {
+ break
+ }
+ }
+ if n == 0 {
+ return 0, nil, err
+ }
+ outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
+ return uintptr(n), nil, nil
+}
+
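tee(2) differs from splice(2) in that it duplicates data between two pipes without consuming the input (a sketch; assumes golang.org/x/sys/unix):

    import "golang.org/x/sys/unix"

    // teePipes copies up to n bytes between two pipes; the data remains
    // readable from pipeInFD afterwards.
    func teePipes(pipeInFD, pipeOutFD, n int) (int64, error) {
    	return unix.Tee(pipeInFD, pipeOutFD, n, 0)
    }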
+// Sendfile implements Linux syscall sendfile(2).
+func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ outFD := args[0].Int()
+ inFD := args[1].Int()
+ offsetAddr := args[2].Pointer()
+ count := int64(args[3].SizeT())
+
+ inFile := t.GetFileVFS2(inFD)
+ if inFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer inFile.DecRef(t)
+ if !inFile.IsReadable() {
+ return 0, nil, syserror.EBADF
+ }
+
+ outFile := t.GetFileVFS2(outFD)
+ if outFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer outFile.DecRef(t)
+ if !outFile.IsWritable() {
+ return 0, nil, syserror.EBADF
+ }
+
+ // Verify that outFile was not opened with O_APPEND.
+ if outFile.StatusFlags()&linux.O_APPEND != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Verify that inFile is a regular file or block device. This is a
+ // requirement; the same check appears in Linux
+ // (fs/splice.c:splice_direct_to_actor).
+ if stat, err := inFile.Stat(t, vfs.StatOptions{Mask: linux.STATX_TYPE}); err != nil {
+ return 0, nil, err
+ } else if stat.Mask&linux.STATX_TYPE == 0 ||
+ (stat.Mode&linux.S_IFMT != linux.S_IFREG && stat.Mode&linux.S_IFMT != linux.S_IFBLK) {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Copy offset if it exists.
+ offset := int64(-1)
+ if offsetAddr != 0 {
+ if inFile.Options().DenyPRead {
+ return 0, nil, syserror.ESPIPE
+ }
+ if _, err := t.CopyIn(offsetAddr, &offset); err != nil {
+ return 0, nil, err
+ }
+ if offset < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ if offset+count < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ }
+
+ // Validate count. This must come after offset checks.
+ if count < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ if count == 0 {
+ return 0, nil, nil
+ }
+ if count > int64(kernel.MAX_RW_COUNT) {
+ count = int64(kernel.MAX_RW_COUNT)
+ }
+
+ // Copy data.
+ var (
+ n int64
+ err error
+ )
+ dw := dualWaiter{
+ inFile: inFile,
+ outFile: outFile,
+ }
+ defer dw.destroy()
+ outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
+ // Reading from the input file should never block, since it is a regular
+ // file or block device. We only need to check whether writing to the
+ // output file can block.
+ nonBlock := outFile.StatusFlags()&linux.O_NONBLOCK != 0
+ if outIsPipe {
+ for n < count {
+ var spliceN int64
+ if offset != -1 {
+ spliceN, err = inFile.PRead(t, outPipeFD.IOSequence(count), offset, vfs.ReadOptions{})
+ offset += spliceN
+ } else {
+ spliceN, err = inFile.Read(t, outPipeFD.IOSequence(count), vfs.ReadOptions{})
+ }
+ if spliceN == 0 && err == io.EOF {
+ // We reached the end of the file. Eat the error and exit the loop.
+ err = nil
+ break
+ }
+ n += spliceN
+ if err == syserror.ErrWouldBlock && !nonBlock {
+ err = dw.waitForBoth(t)
+ }
+ if err != nil {
+ break
+ }
+ }
+ } else {
+ // Read inFile to buffer, then write the contents to outFile.
+ buf := make([]byte, count)
+ for n < count {
+ var readN int64
+ if offset != -1 {
+ readN, err = inFile.PRead(t, usermem.BytesIOSequence(buf), offset, vfs.ReadOptions{})
+ offset += readN
+ } else {
+ readN, err = inFile.Read(t, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
+ }
+ if readN == 0 && err == io.EOF {
+ // We reached the end of the file. Eat the error and exit the loop.
+ err = nil
+ break
+ }
+ n += readN
+ if err != nil {
+ break
+ }
+
+ // Write all of the bytes that we read. This may need
+ // multiple write calls to complete.
+ wbuf := buf[:readN]
+ for len(wbuf) > 0 {
+ var writeN int64
+ writeN, err = outFile.Write(t, usermem.BytesIOSequence(wbuf), vfs.WriteOptions{})
+ wbuf = wbuf[writeN:]
+ if err == syserror.ErrWouldBlock && !nonBlock {
+ err = dw.waitForOut(t)
+ }
+ if err != nil {
+ // We didn't complete the write. Only
+ // report the bytes that were actually
+ // written, and rewind the offset.
+ notWritten := int64(len(wbuf))
+ n -= notWritten
+ if offset != -1 {
+ offset -= notWritten
+ }
+ break
+ }
+ }
+ if err == syserror.ErrWouldBlock && !nonBlock {
+ err = dw.waitForBoth(t)
+ }
+ if err != nil {
+ break
+ }
+ }
+ }
+
+ if offsetAddr != 0 {
+ // Copy out the new offset.
+ if _, err := t.CopyOut(offsetAddr, offset); err != nil {
+ return 0, nil, err
+ }
+ }
+
+ if n == 0 {
+ return 0, nil, err
+ }
+
+ inFile.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent)
+ outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
+ return uintptr(n), nil, nil
+}
+
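The classic use is serving a regular file over a socket (a sketch; assumes golang.org/x/sys/unix and an invented helper name). The checks above apply: the input must be a regular file or block device, and the output must not be opened with O_APPEND:

    import "golang.org/x/sys/unix"

    // serveFile writes count bytes of fileFD, starting at offset 0, to connFD.
    func serveFile(connFD, fileFD, count int) error {
    	off := int64(0)
    	for count > 0 {
    		n, err := unix.Sendfile(connFD, fileFD, &off, count)
    		if err != nil {
    			return err
    		}
    		if n == 0 { // EOF
    			return nil
    		}
    		count -= n
    	}
    	return nil
    }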
+// dualWaiter is used to wait on one or both vfs.FileDescriptions. It is not
+// thread-safe, and does not take a reference on the vfs.FileDescriptions.
+//
+// Users must call destroy() when finished.
+type dualWaiter struct {
+ inFile *vfs.FileDescription
+ outFile *vfs.FileDescription
+
+ inW waiter.Entry
+ inCh chan struct{}
+ outW waiter.Entry
+ outCh chan struct{}
+}
+
+// waitForBoth waits for both dw.inFile and dw.outFile to be ready.
+func (dw *dualWaiter) waitForBoth(t *kernel.Task) error {
+ if dw.inFile.Readiness(eventMaskRead)&eventMaskRead == 0 {
+ if dw.inCh == nil {
+ dw.inW, dw.inCh = waiter.NewChannelEntry(nil)
+ dw.inFile.EventRegister(&dw.inW, eventMaskRead)
+ // We might be ready now. Try again before blocking.
+ return nil
+ }
+ if err := t.Block(dw.inCh); err != nil {
+ return err
+ }
+ }
+ return dw.waitForOut(t)
+}
+
+// waitForOut waits for dw.outFile to be ready for writing.
+func (dw *dualWaiter) waitForOut(t *kernel.Task) error {
+ if dw.outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 {
+ if dw.outCh == nil {
+ dw.outW, dw.outCh = waiter.NewChannelEntry(nil)
+ dw.outFile.EventRegister(&dw.outW, eventMaskWrite)
+ // We might be ready now. Try again before blocking.
+ return nil
+ }
+ if err := t.Block(dw.outCh); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+// destroy cleans up resources held by dw. No more calls to wait* can occur
+// after destroy is called.
+func (dw *dualWaiter) destroy() {
+ if dw.inCh != nil {
+ dw.inFile.EventUnregister(&dw.inW)
+ dw.inCh = nil
+ }
+ if dw.outCh != nil {
+ dw.outFile.EventUnregister(&dw.outW)
+ dw.outCh = nil
+ }
+ dw.inFile = nil
+ dw.outFile = nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go
index bb1d5cac4..0f5d5189c 100644
--- a/pkg/sentry/syscalls/linux/vfs2/stat.go
+++ b/pkg/sentry/syscalls/linux/vfs2/stat.go
@@ -65,7 +65,7 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags
}
root := t.FSContext().RootDirectoryVFS2()
- defer root.DecRef()
+ defer root.DecRef(t)
start := root
if !path.Absolute {
if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
@@ -73,7 +73,7 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags
}
if dirfd == linux.AT_FDCWD {
start = t.FSContext().WorkingDirectoryVFS2()
- defer start.DecRef()
+ defer start.DecRef(t)
} else {
dirfile := t.GetFileVFS2(dirfd)
if dirfile == nil {
@@ -85,7 +85,7 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags
// former may be able to use opened file state to expedite the
// Stat.
statx, err := dirfile.Stat(t, opts)
- dirfile.DecRef()
+ dirfile.DecRef(t)
if err != nil {
return err
}
@@ -96,8 +96,8 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags
}
start = dirfile.VirtualDentry()
start.IncRef()
- defer start.DecRef()
- dirfile.DecRef()
+ defer start.DecRef(t)
+ dirfile.DecRef(t)
}
}
@@ -132,7 +132,7 @@ func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
statx, err := file.Stat(t, vfs.StatOptions{
Mask: linux.STATX_BASIC_STATS,
@@ -177,7 +177,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
root := t.FSContext().RootDirectoryVFS2()
- defer root.DecRef()
+ defer root.DecRef(t)
start := root
if !path.Absolute {
if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
@@ -185,7 +185,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
if dirfd == linux.AT_FDCWD {
start = t.FSContext().WorkingDirectoryVFS2()
- defer start.DecRef()
+ defer start.DecRef(t)
} else {
dirfile := t.GetFileVFS2(dirfd)
if dirfile == nil {
@@ -197,7 +197,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// former may be able to use opened file state to expedite the
// Stat.
statx, err := dirfile.Stat(t, opts)
- dirfile.DecRef()
+ dirfile.DecRef(t)
if err != nil {
return 0, nil, err
}
@@ -207,8 +207,8 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
start = dirfile.VirtualDentry()
start.IncRef()
- defer start.DecRef()
- dirfile.DecRef()
+ defer start.DecRef(t)
+ dirfile.DecRef(t)
}
}
@@ -282,7 +282,7 @@ func accessAt(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) err
if err != nil {
return err
}
- defer tpop.Release()
+ defer tpop.Release(t)
// access(2) and faccessat(2) check permissions using real
// UID/GID, not effective UID/GID.
@@ -328,7 +328,7 @@ func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr usermem.Addr, siz
if err != nil {
return 0, nil, err
}
- defer tpop.Release()
+ defer tpop.Release(t)
target, err := t.Kernel().VFS().ReadlinkAt(t, t.Credentials(), &tpop.pop)
if err != nil {
@@ -358,7 +358,7 @@ func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if err != nil {
return 0, nil, err
}
- defer tpop.Release()
+ defer tpop.Release(t)
statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop)
if err != nil {
@@ -377,7 +377,7 @@ func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
if err != nil {
return 0, nil, err
}
- defer tpop.Release()
+ defer tpop.Release(t)
statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop)
if err != nil {
diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go
index 365250b0b..a6491ac37 100644
--- a/pkg/sentry/syscalls/linux/vfs2/sync.go
+++ b/pkg/sentry/syscalls/linux/vfs2/sync.go
@@ -34,7 +34,7 @@ func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
return 0, nil, file.SyncFS(t)
}
@@ -47,7 +47,7 @@ func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
return 0, nil, file.Sync(t)
}
@@ -65,10 +65,8 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
nbytes := args[2].Int64()
flags := args[3].Uint()
- if offset < 0 {
- return 0, nil, syserror.EINVAL
- }
- if nbytes < 0 {
+ // Check for negative values and overflow.
+ if offset < 0 || offset+nbytes < 0 {
return 0, nil, syserror.EINVAL
}
if flags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE|linux.SYNC_FILE_RANGE_WRITE|linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 {
@@ -79,9 +77,39 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
- // TODO(gvisor.dev/issue/1897): Avoid writeback of data ranges outside of
- // [offset, offset+nbytes).
- return 0, nil, file.Sync(t)
+ // TODO(gvisor.dev/issue/1897): Currently, the only file syncing we support
+ // is a full-file sync, i.e. fsync(2). As a result, there are severe
+ // limitations on how much we support sync_file_range:
+ // - In Linux, sync_file_range(2) doesn't write out the file's metadata, even
+ // if the file size is changed. We do.
+ // - We always sync the entire file instead of [offset, offset+nbytes).
+ // - We do not support the use of WAIT_BEFORE without WAIT_AFTER. For
+ // correctness, we would have to perform a write-out every time WAIT_BEFORE
+ // was used, but this would be much more expensive than expected if there
+ // were no write-out operations in progress.
+ // - Whenever WAIT_AFTER is used, we sync the file.
+ // - We ignore WRITE. If this flag is used with WAIT_AFTER, then the file
+ // will be synced anyway. If it is used without WAIT_AFTER, then it is
+ // safe (and less expensive) to do nothing, because the syscall will not
+ // wait for the write-out to complete; we only need to make sure that the
+ // next time WAIT_BEFORE or WAIT_AFTER is used, the write-out completes.
+ // - According to fs/sync.c, WAIT_BEFORE|WAIT_AFTER "will detect any I/O
+ // errors or ENOSPC conditions and will return those to the caller, after
+ // clearing the EIO and ENOSPC flags in the address_space." We don't do
+ // this.
+
+ if flags&linux.SYNC_FILE_RANGE_WAIT_BEFORE != 0 &&
+ flags&linux.SYNC_FILE_RANGE_WAIT_AFTER == 0 {
+ t.Kernel().EmitUnimplementedEvent(t)
+ return 0, nil, syserror.ENOSYS
+ }
+
+ if flags&linux.SYNC_FILE_RANGE_WAIT_AFTER != 0 {
+ if err := file.Sync(t); err != nil {
+ return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+ }
+ }
+ return 0, nil, nil
}
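Given those rules, the behavior per flag combination reduces to a small decision (a sketch mirroring the logic just added; the helper is hypothetical):

    // effectiveAction summarizes what the implementation above does for a
    // given sync_file_range flag combination.
    func effectiveAction(flags uint) string {
    	waitBefore := flags&linux.SYNC_FILE_RANGE_WAIT_BEFORE != 0
    	waitAfter := flags&linux.SYNC_FILE_RANGE_WAIT_AFTER != 0
    	switch {
    	case waitBefore && !waitAfter:
    		return "ENOSYS (unimplemented event emitted)"
    	case waitAfter:
    		return "full-file sync, like fsync(2)"
    	default:
    		return "no-op (WRITE alone is safely ignored)"
    	}
    }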
diff --git a/pkg/sentry/syscalls/linux/vfs2/timerfd.go b/pkg/sentry/syscalls/linux/vfs2/timerfd.go
index 5ac79bc09..7a26890ef 100644
--- a/pkg/sentry/syscalls/linux/vfs2/timerfd.go
+++ b/pkg/sentry/syscalls/linux/vfs2/timerfd.go
@@ -50,11 +50,11 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
return 0, nil, syserror.EINVAL
}
vfsObj := t.Kernel().VFS()
- file, err := timerfd.New(vfsObj, clock, fileFlags)
+ file, err := timerfd.New(t, vfsObj, clock, fileFlags)
if err != nil {
return 0, nil, err
}
- defer file.DecRef()
+ defer file.DecRef(t)
fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
CloseOnExec: flags&linux.TFD_CLOEXEC != 0,
})
@@ -79,7 +79,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
tfd, ok := file.Impl().(*timerfd.TimerFileDescription)
if !ok {
@@ -113,7 +113,7 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
tfd, ok := file.Impl().(*timerfd.TimerFileDescription)
if !ok {
diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
index a332d01bd..c576d9475 100644
--- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go
+++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
@@ -44,7 +44,7 @@ func Override() {
s.Table[23] = syscalls.Supported("select", Select)
s.Table[32] = syscalls.Supported("dup", Dup)
s.Table[33] = syscalls.Supported("dup2", Dup2)
- delete(s.Table, 40) // sendfile
+ s.Table[40] = syscalls.Supported("sendfile", Sendfile)
s.Table[41] = syscalls.Supported("socket", Socket)
s.Table[42] = syscalls.Supported("connect", Connect)
s.Table[43] = syscalls.Supported("accept", Accept)
@@ -62,7 +62,7 @@ func Override() {
s.Table[55] = syscalls.Supported("getsockopt", GetSockOpt)
s.Table[59] = syscalls.Supported("execve", Execve)
s.Table[72] = syscalls.Supported("fcntl", Fcntl)
- delete(s.Table, 73) // flock
+ s.Table[73] = syscalls.Supported("flock", Flock)
s.Table[74] = syscalls.Supported("fsync", Fsync)
s.Table[75] = syscalls.Supported("fdatasync", Fdatasync)
s.Table[76] = syscalls.Supported("truncate", Truncate)
@@ -90,9 +90,9 @@ func Override() {
s.Table[138] = syscalls.Supported("fstatfs", Fstatfs)
s.Table[161] = syscalls.Supported("chroot", Chroot)
s.Table[162] = syscalls.Supported("sync", Sync)
- delete(s.Table, 165) // mount
- delete(s.Table, 166) // umount2
- delete(s.Table, 187) // readahead
+ s.Table[165] = syscalls.Supported("mount", Mount)
+ s.Table[166] = syscalls.Supported("umount2", Umount2)
+ s.Table[187] = syscalls.Supported("readahead", Readahead)
s.Table[188] = syscalls.Supported("setxattr", Setxattr)
s.Table[189] = syscalls.Supported("lsetxattr", Lsetxattr)
s.Table[190] = syscalls.Supported("fsetxattr", Fsetxattr)
@@ -105,20 +105,16 @@ func Override() {
s.Table[197] = syscalls.Supported("removexattr", Removexattr)
s.Table[198] = syscalls.Supported("lremovexattr", Lremovexattr)
s.Table[199] = syscalls.Supported("fremovexattr", Fremovexattr)
- delete(s.Table, 206) // io_setup
- delete(s.Table, 207) // io_destroy
- delete(s.Table, 208) // io_getevents
- delete(s.Table, 209) // io_submit
- delete(s.Table, 210) // io_cancel
+ s.Table[209] = syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"})
s.Table[213] = syscalls.Supported("epoll_create", EpollCreate)
s.Table[217] = syscalls.Supported("getdents64", Getdents64)
- delete(s.Table, 221) // fdavise64
+ s.Table[221] = syscalls.PartiallySupported("fadvise64", Fadvise64, "The syscall is 'supported', but ignores all provided advice.", nil)
s.Table[232] = syscalls.Supported("epoll_wait", EpollWait)
s.Table[233] = syscalls.Supported("epoll_ctl", EpollCtl)
s.Table[235] = syscalls.Supported("utimes", Utimes)
- delete(s.Table, 253) // inotify_init
- delete(s.Table, 254) // inotify_add_watch
- delete(s.Table, 255) // inotify_rm_watch
+ s.Table[253] = syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil)
+ s.Table[254] = syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil)
+ s.Table[255] = syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil)
s.Table[257] = syscalls.Supported("openat", Openat)
s.Table[258] = syscalls.Supported("mkdirat", Mkdirat)
s.Table[259] = syscalls.Supported("mknodat", Mknodat)
@@ -134,15 +130,15 @@ func Override() {
s.Table[269] = syscalls.Supported("faccessat", Faccessat)
s.Table[270] = syscalls.Supported("pselect", Pselect)
s.Table[271] = syscalls.Supported("ppoll", Ppoll)
- delete(s.Table, 275) // splice
- delete(s.Table, 276) // tee
+ s.Table[275] = syscalls.Supported("splice", Splice)
+ s.Table[276] = syscalls.Supported("tee", Tee)
s.Table[277] = syscalls.Supported("sync_file_range", SyncFileRange)
s.Table[280] = syscalls.Supported("utimensat", Utimensat)
s.Table[281] = syscalls.Supported("epoll_pwait", EpollPwait)
s.Table[282] = syscalls.Supported("signalfd", Signalfd)
s.Table[283] = syscalls.Supported("timerfd_create", TimerfdCreate)
s.Table[284] = syscalls.Supported("eventfd", Eventfd)
- delete(s.Table, 285) // fallocate
+ s.Table[285] = syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil)
s.Table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime)
s.Table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime)
s.Table[288] = syscalls.Supported("accept4", Accept4)
@@ -151,7 +147,7 @@ func Override() {
s.Table[291] = syscalls.Supported("epoll_create1", EpollCreate1)
s.Table[292] = syscalls.Supported("dup3", Dup3)
s.Table[293] = syscalls.Supported("pipe2", Pipe2)
- delete(s.Table, 294) // inotify_init1
+ s.Table[294] = syscalls.PartiallySupported("inotify_init1", InotifyInit1, "inotify events are only available inside the sandbox.", nil)
s.Table[295] = syscalls.Supported("preadv", Preadv)
s.Table[296] = syscalls.Supported("pwritev", Pwritev)
s.Table[299] = syscalls.Supported("recvmmsg", RecvMMsg)
@@ -167,6 +163,106 @@ func Override() {
// Override ARM64.
s = linux.ARM64
+ s.Table[5] = syscalls.Supported("setxattr", Setxattr)
+ s.Table[6] = syscalls.Supported("lsetxattr", Lsetxattr)
+ s.Table[7] = syscalls.Supported("fsetxattr", Fsetxattr)
+ s.Table[8] = syscalls.Supported("getxattr", Getxattr)
+ s.Table[9] = syscalls.Supported("lgetxattr", Lgetxattr)
+ s.Table[10] = syscalls.Supported("fgetxattr", Fgetxattr)
+ s.Table[11] = syscalls.Supported("listxattr", Listxattr)
+ s.Table[12] = syscalls.Supported("llistxattr", Llistxattr)
+ s.Table[13] = syscalls.Supported("flistxattr", Flistxattr)
+ s.Table[14] = syscalls.Supported("removexattr", Removexattr)
+ s.Table[15] = syscalls.Supported("lremovexattr", Lremovexattr)
+ s.Table[16] = syscalls.Supported("fremovexattr", Fremovexattr)
+ s.Table[17] = syscalls.Supported("getcwd", Getcwd)
+ s.Table[19] = syscalls.Supported("eventfd2", Eventfd2)
+ s.Table[20] = syscalls.Supported("epoll_create1", EpollCreate1)
+ s.Table[21] = syscalls.Supported("epoll_ctl", EpollCtl)
+ s.Table[22] = syscalls.Supported("epoll_pwait", EpollPwait)
+ s.Table[23] = syscalls.Supported("dup", Dup)
+ s.Table[24] = syscalls.Supported("dup3", Dup3)
+ s.Table[25] = syscalls.Supported("fcntl", Fcntl)
+ s.Table[26] = syscalls.PartiallySupported("inotify_init1", InotifyInit1, "inotify events are only available inside the sandbox.", nil)
+ s.Table[27] = syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil)
+ s.Table[28] = syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil)
+ s.Table[29] = syscalls.Supported("ioctl", Ioctl)
+ s.Table[32] = syscalls.Supported("flock", Flock)
+ s.Table[33] = syscalls.Supported("mknodat", Mknodat)
+ s.Table[34] = syscalls.Supported("mkdirat", Mkdirat)
+ s.Table[35] = syscalls.Supported("unlinkat", Unlinkat)
+ s.Table[36] = syscalls.Supported("symlinkat", Symlinkat)
+ s.Table[37] = syscalls.Supported("linkat", Linkat)
+ s.Table[38] = syscalls.Supported("renameat", Renameat)
+ s.Table[39] = syscalls.Supported("umount2", Umount2)
+ s.Table[40] = syscalls.Supported("mount", Mount)
+ s.Table[43] = syscalls.Supported("statfs", Statfs)
+ s.Table[44] = syscalls.Supported("fstatfs", Fstatfs)
+ s.Table[45] = syscalls.Supported("truncate", Truncate)
+ s.Table[46] = syscalls.Supported("ftruncate", Ftruncate)
+ s.Table[48] = syscalls.Supported("faccessat", Faccessat)
+ s.Table[49] = syscalls.Supported("chdir", Chdir)
+ s.Table[50] = syscalls.Supported("fchdir", Fchdir)
+ s.Table[51] = syscalls.Supported("chroot", Chroot)
+ s.Table[52] = syscalls.Supported("fchmod", Fchmod)
+ s.Table[53] = syscalls.Supported("fchmodat", Fchmodat)
+ s.Table[54] = syscalls.Supported("fchownat", Fchownat)
+ s.Table[55] = syscalls.Supported("fchown", Fchown)
+ s.Table[56] = syscalls.Supported("openat", Openat)
+ s.Table[57] = syscalls.Supported("close", Close)
+ s.Table[59] = syscalls.Supported("pipe2", Pipe2)
+ s.Table[61] = syscalls.Supported("getdents64", Getdents64)
+ s.Table[62] = syscalls.Supported("lseek", Lseek)
s.Table[63] = syscalls.Supported("read", Read)
+ s.Table[64] = syscalls.Supported("write", Write)
+ s.Table[65] = syscalls.Supported("readv", Readv)
+ s.Table[66] = syscalls.Supported("writev", Writev)
+ s.Table[67] = syscalls.Supported("pread64", Pread64)
+ s.Table[68] = syscalls.Supported("pwrite64", Pwrite64)
+ s.Table[69] = syscalls.Supported("preadv", Preadv)
+ s.Table[70] = syscalls.Supported("pwritev", Pwritev)
+ s.Table[72] = syscalls.Supported("pselect", Pselect)
+ s.Table[73] = syscalls.Supported("ppoll", Ppoll)
+ s.Table[74] = syscalls.Supported("signalfd4", Signalfd4)
+ s.Table[76] = syscalls.Supported("splice", Splice)
+ s.Table[77] = syscalls.Supported("tee", Tee)
+ s.Table[78] = syscalls.Supported("readlinkat", Readlinkat)
+ s.Table[80] = syscalls.Supported("fstat", Fstat)
+ s.Table[81] = syscalls.Supported("sync", Sync)
+ s.Table[82] = syscalls.Supported("fsync", Fsync)
+ s.Table[83] = syscalls.Supported("fdatasync", Fdatasync)
+ s.Table[84] = syscalls.Supported("sync_file_range", SyncFileRange)
+ s.Table[85] = syscalls.Supported("timerfd_create", TimerfdCreate)
+ s.Table[86] = syscalls.Supported("timerfd_settime", TimerfdSettime)
+ s.Table[87] = syscalls.Supported("timerfd_gettime", TimerfdGettime)
+ s.Table[88] = syscalls.Supported("utimensat", Utimensat)
+ s.Table[198] = syscalls.Supported("socket", Socket)
+ s.Table[199] = syscalls.Supported("socketpair", SocketPair)
+ s.Table[200] = syscalls.Supported("bind", Bind)
+ s.Table[201] = syscalls.Supported("listen", Listen)
+ s.Table[202] = syscalls.Supported("accept", Accept)
+ s.Table[203] = syscalls.Supported("connect", Connect)
+ s.Table[204] = syscalls.Supported("getsockname", GetSockName)
+ s.Table[205] = syscalls.Supported("getpeername", GetPeerName)
+ s.Table[206] = syscalls.Supported("sendto", SendTo)
+ s.Table[207] = syscalls.Supported("recvfrom", RecvFrom)
+ s.Table[208] = syscalls.Supported("setsockopt", SetSockOpt)
+ s.Table[209] = syscalls.Supported("getsockopt", GetSockOpt)
+ s.Table[210] = syscalls.Supported("shutdown", Shutdown)
+ s.Table[211] = syscalls.Supported("sendmsg", SendMsg)
+ s.Table[212] = syscalls.Supported("recvmsg", RecvMsg)
+ s.Table[221] = syscalls.Supported("execve", Execve)
+ s.Table[222] = syscalls.Supported("mmap", Mmap)
+ s.Table[242] = syscalls.Supported("accept4", Accept4)
+ s.Table[243] = syscalls.Supported("recvmmsg", RecvMMsg)
+ s.Table[267] = syscalls.Supported("syncfs", Syncfs)
+ s.Table[269] = syscalls.Supported("sendmmsg", SendMMsg)
+ s.Table[276] = syscalls.Supported("renameat2", Renameat2)
+ s.Table[279] = syscalls.Supported("memfd_create", MemfdCreate)
+ s.Table[281] = syscalls.Supported("execveat", Execveat)
+ s.Table[286] = syscalls.Supported("preadv2", Preadv2)
+ s.Table[287] = syscalls.Supported("pwritev2", Pwritev2)
+ s.Table[291] = syscalls.Supported("statx", Statx)
+
s.Init()
}
diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go
index af455d5c1..ef99246ed 100644
--- a/pkg/sentry/syscalls/linux/vfs2/xattr.go
+++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go
@@ -49,7 +49,7 @@ func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSyml
if err != nil {
return 0, nil, err
}
- defer tpop.Release()
+ defer tpop.Release(t)
names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop, uint64(size))
if err != nil {
@@ -72,7 +72,7 @@ func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
names, err := file.Listxattr(t, uint64(size))
if err != nil {
@@ -109,7 +109,7 @@ func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli
if err != nil {
return 0, nil, err
}
- defer tpop.Release()
+ defer tpop.Release(t)
name, err := copyInXattrName(t, nameAddr)
if err != nil {
@@ -141,7 +141,7 @@ func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
name, err := copyInXattrName(t, nameAddr)
if err != nil {
@@ -188,7 +188,7 @@ func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli
if err != nil {
return err
}
- defer tpop.Release()
+ defer tpop.Release(t)
name, err := copyInXattrName(t, nameAddr)
if err != nil {
@@ -222,7 +222,7 @@ func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
name, err := copyInXattrName(t, nameAddr)
if err != nil {
@@ -262,7 +262,7 @@ func removexattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSy
if err != nil {
return err
}
- defer tpop.Release()
+ defer tpop.Release(t)
name, err := copyInXattrName(t, nameAddr)
if err != nil {
@@ -281,7 +281,7 @@ func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
if file == nil {
return 0, nil, syserror.EBADF
}
- defer file.DecRef()
+ defer file.DecRef(t)
name, err := copyInXattrName(t, nameAddr)
if err != nil {
diff --git a/pkg/sentry/time/muldiv_arm64.s b/pkg/sentry/time/muldiv_arm64.s
index 5ad57a8a3..8afc62d53 100644
--- a/pkg/sentry/time/muldiv_arm64.s
+++ b/pkg/sentry/time/muldiv_arm64.s
@@ -12,12 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#include "funcdata.h"
#include "textflag.h"
// Documentation is available in parameters.go.
//
// func muldiv64(value, multiplier, divisor uint64) (uint64, bool)
TEXT ·muldiv64(SB),NOSPLIT,$40-33
+ GO_ARGS
+ NO_LOCAL_POINTERS
MOVD value+0(FP), R0
MOVD multiplier+8(FP), R1
MOVD divisor+16(FP), R2
diff --git a/pkg/sentry/time/parameters.go b/pkg/sentry/time/parameters.go
index 65868cb26..cd1b95117 100644
--- a/pkg/sentry/time/parameters.go
+++ b/pkg/sentry/time/parameters.go
@@ -228,11 +228,15 @@ func errorAdjust(prevParams Parameters, newParams Parameters, now TSCValue) (Par
//
// The log level is determined by the error severity.
func logErrorAdjustment(clock ClockID, errorNS ReferenceNS, orig, adjusted Parameters) {
- fn := log.Debugf
- if int64(errorNS.Magnitude()) > time.Millisecond.Nanoseconds() {
+ magNS := int64(errorNS.Magnitude())
+ if magNS <= 10*time.Microsecond.Nanoseconds() {
+ // Don't log small errors.
+ return
+ }
+ fn := log.Infof
+ if magNS > time.Millisecond.Nanoseconds() {
+ // Upgrade large errors to warning.
fn = log.Warningf
- } else if int64(errorNS.Magnitude()) > 10*time.Microsecond.Nanoseconds() {
- fn = log.Infof
}
fn("Clock(%v): error: %v ns, adjusted frequency from %v Hz to %v Hz", clock, errorNS, orig.Frequency, adjusted.Frequency)
diff --git a/pkg/sentry/time/parameters_test.go b/pkg/sentry/time/parameters_test.go
index e1b9084ac..0ce1257f6 100644
--- a/pkg/sentry/time/parameters_test.go
+++ b/pkg/sentry/time/parameters_test.go
@@ -484,3 +484,18 @@ func TestMulDivOverflow(t *testing.T) {
})
}
}
+
+func BenchmarkMuldiv64(b *testing.B) {
+ var v uint64 = math.MaxUint64
+ for i := uint64(1); i <= 1000000; i++ {
+ mult := uint64(1000000000)
+ div := i * mult
+ res, ok := muldiv64(v, mult, div)
+ if !ok {
+ b.Errorf("Result of %v * %v / %v ok got false want true", v, mult, div)
+ }
+ if want := v / i; res != want {
+ b.Errorf("Bad result of %v * %v / %v: got %v, want %v", v, mult, div, res, want)
+ }
+ }
+}
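The loop above always runs one million iterations regardless of b.N, so the framework's reported ns/op is not meaningful. An idiomatic variant (a sketch, not part of the change) would let b.N drive the count:

    func BenchmarkMuldiv64Scaled(b *testing.B) {
    	var v uint64 = math.MaxUint64
    	for i := 0; i < b.N; i++ {
    		div := uint64(i%1000000+1) * 1000000000
    		if _, ok := muldiv64(v, 1000000000, div); !ok {
    			b.Fatalf("muldiv64(%v, 1e9, %v) overflowed", v, div)
    		}
    	}
    }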
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 94d69c1cc..642769e7c 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -15,6 +15,18 @@ go_template_instance(
},
)
+go_template_instance(
+ name = "event_list",
+ out = "event_list.go",
+ package = "vfs",
+ prefix = "event",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*Event",
+ "Linker": "*Event",
+ },
+)
+
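This instantiates gVisor's intrusive-list template for queued inotify events. Assuming the generated API matches other pkg/ilist instantiations, usage inside the vfs package looks like this (ev is a placeholder *Event that embeds the generated eventEntry):

    var queue eventList
    queue.PushBack(ev) // append a pending inotify event
    for e := queue.Front(); e != nil; e = e.Next() {
    	// drain or coalesce queued events
    }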
go_library(
name = "vfs",
srcs = [
@@ -25,11 +37,14 @@ go_library(
"device.go",
"epoll.go",
"epoll_interest_list.go",
+ "event_list.go",
"file_description.go",
"file_description_impl_util.go",
"filesystem.go",
"filesystem_impl_util.go",
"filesystem_type.go",
+ "inotify.go",
+ "lock.go",
"mount.go",
"mount_unsafe.go",
"options.go",
@@ -57,6 +72,7 @@ go_library(
"//pkg/sentry/limits",
"//pkg/sentry/memmap",
"//pkg/sentry/socket/unix/transport",
+ "//pkg/sentry/uniqueid",
"//pkg/sync",
"//pkg/syserror",
"//pkg/usermem",
diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md
index 9aa133bcb..4b9faf2ea 100644
--- a/pkg/sentry/vfs/README.md
+++ b/pkg/sentry/vfs/README.md
@@ -39,8 +39,8 @@ Mount references are held by:
- Mount: Each referenced Mount holds a reference on its parent, which is the
mount containing its mount point.
-- VirtualFilesystem: A reference is held on each Mount that has not been
- umounted.
+- VirtualFilesystem: A reference is held on each Mount that has been connected
+ to a mount point, but not yet umounted.
MountNamespace and FileDescription references are held by users of VFS. The
expectation is that each `kernel.Task` holds a reference on its corresponding
@@ -169,8 +169,6 @@ This construction, which is essentially a type-safe analogue to Linux's
- binder, which is similarly far too incomplete to use.
- - whitelistfs, which we are already actively attempting to remove.
-
- Save/restore. For instance, it is unclear if the current implementation of
the `state` package supports the inheritance pattern described above.
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
index caf770fd5..5a0e3e6b5 100644
--- a/pkg/sentry/vfs/anonfs.go
+++ b/pkg/sentry/vfs/anonfs.go
@@ -82,7 +82,7 @@ type anonDentry struct {
}
// Release implements FilesystemImpl.Release.
-func (fs *anonFilesystem) Release() {
+func (fs *anonFilesystem) Release(ctx context.Context) {
}
// Sync implements FilesystemImpl.Sync.
@@ -294,6 +294,21 @@ func (d *anonDentry) TryIncRef() bool {
}
// DecRef implements DentryImpl.DecRef.
-func (d *anonDentry) DecRef() {
+func (d *anonDentry) DecRef(ctx context.Context) {
// no-op
}
+
+// InotifyWithParent implements DentryImpl.InotifyWithParent.
+//
+// Although Linux technically supports inotify on pseudo filesystems (inotify
+// is implemented at the vfs layer), it is not particularly useful. It is left
+// unimplemented until someone actually needs it.
+func (d *anonDentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType) {}
+
+// Watches implements DentryImpl.Watches.
+func (d *anonDentry) Watches() *Watches {
+ return nil
+}
+
+// OnZeroWatches implements DentryImpl.OnZeroWatches.
+func (d *anonDentry) OnZeroWatches(context.Context) {}
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index 8624dbd5d..bc7ea93ea 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -17,6 +17,7 @@ package vfs
import (
"sync/atomic"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -102,7 +103,40 @@ type DentryImpl interface {
TryIncRef() bool
// DecRef decrements the Dentry's reference count.
- DecRef()
+ DecRef(ctx context.Context)
+
+ // InotifyWithParent notifies all watches on the targets represented by this
+ // dentry and its parent. The parent's watches are notified first, followed
+ // by this dentry's.
+ //
+ // InotifyWithParent automatically adds the IN_ISDIR flag for dentries
+ // representing directories.
+ //
+ // Note that the events may not actually propagate up to the user, depending
+ // on the event masks.
+ InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType)
+
+ // Watches returns the set of inotify watches for the file corresponding to
+ // the Dentry. Dentries that are hard links to the same underlying file
+ // share the same watches.
+ //
+ // Watches may return nil if the dentry belongs to a FilesystemImpl that
+ // does not support inotify. If an implementation returns a non-nil watch
+ // set, it must always return a non-nil watch set. Likewise, if an
+ // implementation returns a nil watch set, it must always return a nil watch
+ // set.
+ //
+ // The caller does not need to hold a reference on the dentry.
+ Watches() *Watches
+
+ // OnZeroWatches is called whenever the number of watches on a dentry drops
+ // to zero. This is needed by some FilesystemImpls (e.g. gofer) to manage
+ // dentry lifetime.
+ //
+ // The caller does not need to hold a reference on the dentry. OnZeroWatches
+ // may acquire inotify locks, so to prevent deadlock, no inotify locks should
+ // be held by the caller.
+ OnZeroWatches(ctx context.Context)
}
// IncRef increments d's reference count.
@@ -117,8 +151,8 @@ func (d *Dentry) TryIncRef() bool {
}
// DecRef decrements d's reference count.
-func (d *Dentry) DecRef() {
- d.impl.DecRef()
+func (d *Dentry) DecRef(ctx context.Context) {
+ d.impl.DecRef(ctx)
}
// IsDead returns true if d has been deleted or invalidated by its owning
@@ -133,6 +167,26 @@ func (d *Dentry) isMounted() bool {
return atomic.LoadUint32(&d.mounts) != 0
}
+// InotifyWithParent notifies all watches on the targets represented by d and
+// its parent of events.
+func (d *Dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType) {
+ d.impl.InotifyWithParent(ctx, events, cookie, et)
+}
+
+// Watches returns the set of inotify watches associated with d.
+//
+// Watches will return nil if d belongs to a FilesystemImpl that does not
+// support inotify.
+func (d *Dentry) Watches() *Watches {
+ return d.impl.Watches()
+}
+
+// OnZeroWatches performs cleanup tasks whenever the number of watches on a
+// dentry drops to zero.
+func (d *Dentry) OnZeroWatches(ctx context.Context) {
+ d.impl.OnZeroWatches(ctx)
+}
+
// The following functions are exported so that filesystem implementations can
// use them. The vfs package, and users of VFS, should not call these
// functions.
@@ -161,11 +215,11 @@ func (vfs *VirtualFilesystem) AbortDeleteDentry(d *Dentry) {
// CommitDeleteDentry must be called after PrepareDeleteDentry if the deletion
// succeeds.
-func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) {
+func (vfs *VirtualFilesystem) CommitDeleteDentry(ctx context.Context, d *Dentry) {
d.dead = true
d.mu.Unlock()
if d.isMounted() {
- vfs.forgetDeadMountpoint(d)
+ vfs.forgetDeadMountpoint(ctx, d)
}
}
@@ -173,12 +227,12 @@ func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) {
// did for reasons outside of VFS' control (e.g. d represents the local state
// of a file on a remote filesystem on which the file has already been
// deleted).
-func (vfs *VirtualFilesystem) InvalidateDentry(d *Dentry) {
+func (vfs *VirtualFilesystem) InvalidateDentry(ctx context.Context, d *Dentry) {
d.mu.Lock()
d.dead = true
d.mu.Unlock()
if d.isMounted() {
- vfs.forgetDeadMountpoint(d)
+ vfs.forgetDeadMountpoint(ctx, d)
}
}
@@ -225,13 +279,13 @@ func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) {
// that was replaced by from.
//
// Preconditions: PrepareRenameDentry was previously called on from and to.
-func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, to *Dentry) {
+func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(ctx context.Context, from, to *Dentry) {
from.mu.Unlock()
if to != nil {
to.dead = true
to.mu.Unlock()
if to.isMounted() {
- vfs.forgetDeadMountpoint(to)
+ vfs.forgetDeadMountpoint(ctx, to)
}
}
}
@@ -250,7 +304,7 @@ func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) {
//
// forgetDeadMountpoint is analogous to Linux's
// fs/namespace.c:__detach_mounts().
-func (vfs *VirtualFilesystem) forgetDeadMountpoint(d *Dentry) {
+func (vfs *VirtualFilesystem) forgetDeadMountpoint(ctx context.Context, d *Dentry) {
var (
vdsToDecRef []VirtualDentry
mountsToDecRef []*Mount
@@ -263,9 +317,9 @@ func (vfs *VirtualFilesystem) forgetDeadMountpoint(d *Dentry) {
vfs.mounts.seq.EndWrite()
vfs.mountMu.Unlock()
for _, vd := range vdsToDecRef {
- vd.DecRef()
+ vd.DecRef(ctx)
}
for _, mnt := range mountsToDecRef {
- mnt.DecRef()
+ mnt.DecRef(ctx)
}
}
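
The wrappers above are the entry points higher layers use to generate events. A minimal sketch of the intended call pattern from a write path, assuming an open file on a filesystem that supports inotify; notifyWrite is a hypothetical helper, not part of this change:

```go
package example

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// notifyWrite generates IN_MODIFY on the written file and its parent. The
// cookie is zero because the event is not half of a rename pair.
func notifyWrite(ctx context.Context, fd *vfs.FileDescription) {
	fd.Dentry().InotifyWithParent(ctx, linux.IN_MODIFY, 0 /* cookie */, vfs.PathEvent)
}
```
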
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
index 8297f964b..1b5af9f73 100644
--- a/pkg/sentry/vfs/epoll.go
+++ b/pkg/sentry/vfs/epoll.go
@@ -31,6 +31,7 @@ type EpollInstance struct {
vfsfd FileDescription
FileDescriptionDefaultImpl
DentryMetadataFileDescriptionImpl
+ NoLockFD
// q holds waiters on this EpollInstance.
q waiter.Queue
@@ -92,9 +93,9 @@ type epollInterest struct {
// NewEpollInstanceFD returns a FileDescription representing a new epoll
// instance. A reference is taken on the returned FileDescription.
-func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) {
+func (vfs *VirtualFilesystem) NewEpollInstanceFD(ctx context.Context) (*FileDescription, error) {
vd := vfs.NewAnonVirtualDentry("[eventpoll]")
- defer vd.DecRef()
+ defer vd.DecRef(ctx)
ep := &EpollInstance{
interest: make(map[epollInterestKey]*epollInterest),
}
@@ -109,7 +110,7 @@ func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) {
}
// Release implements FileDescriptionImpl.Release.
-func (ep *EpollInstance) Release() {
+func (ep *EpollInstance) Release(ctx context.Context) {
// Unregister all polled fds.
ep.interestMu.Lock()
defer ep.interestMu.Unlock()
@@ -185,7 +186,7 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event lin
}
// Register interest in file.
- mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP
+ mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP
epi := &epollInterest{
epoll: ep,
key: key,
@@ -256,7 +257,7 @@ func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event
}
// Update epi for the next call to ep.ReadEvents().
- mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP
+ mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP
ep.mu.Lock()
epi.mask = mask
epi.userData = event.Data
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index cfabd936c..dcafffe57 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -42,11 +42,20 @@ type FileDescription struct {
// operations.
refs int64
+ // flagsMu protects statusFlags and asyncHandler below.
+ flagsMu sync.Mutex
+
// statusFlags contains status flags, "initialized by open(2) and possibly
- // modified by fcntl()" - fcntl(2). statusFlags is accessed using atomic
- // memory operations.
+ // modified by fcntl()" - fcntl(2). statusFlags can be read using atomic
+ // memory operations when it does not need to be synchronized with an
+ // access to asyncHandler.
statusFlags uint32
+ // asyncHandler handles O_ASYNC signal generation. It is set with the
+ // F_SETOWN or F_SETOWN_EX fcntls. For asyncHandler to be used, O_ASYNC must
+ // also be set by fcntl(2).
+ asyncHandler FileAsync
+
// epolls is the set of epollInterests registered for this FileDescription.
// epolls is protected by epollMu.
epollMu sync.Mutex
@@ -73,6 +82,8 @@ type FileDescription struct {
// writable is analogous to Linux's FMODE_WRITE.
writable bool
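+
+ // usedLockBSD is non-zero if this FD has acquired a BSD-style advisory
+ // lock via LockBSD. It is accessed using atomic memory operations and is
+ // consulted in DecRef to release the lock on the final reference drop.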
+ usedLockBSD uint32
+
// impl is the FileDescriptionImpl associated with this Filesystem. impl is
// immutable. This should be the last field in FileDescription.
impl FileDescriptionImpl
@@ -80,8 +91,7 @@ type FileDescription struct {
// FileDescriptionOptions contains options to FileDescription.Init().
type FileDescriptionOptions struct {
- // If AllowDirectIO is true, allow O_DIRECT to be set on the file. This is
- // usually only the case if O_DIRECT would actually have an effect.
+ // If AllowDirectIO is true, allow O_DIRECT to be set on the file.
AllowDirectIO bool
// If DenyPRead is true, calls to FileDescription.PRead() return ESPIPE.
@@ -106,6 +116,10 @@ type FileDescriptionOptions struct {
UseDentryMetadata bool
}
+// FileCreationFlags are the set of flags passed to FileDescription.Init() but
+// omitted from FileDescription.StatusFlags().
+const FileCreationFlags = linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC
+
// Init must be called before first use of fd. If it succeeds, it takes
// references on mnt and d. flags is the initial file description flags, which
// is usually the full set of flags passed to open(2).
@@ -120,8 +134,8 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mou
fd.refs = 1
// Remove "file creation flags" to mirror the behavior from file.f_flags in
- // fs/open.c:do_dentry_open
- fd.statusFlags = flags &^ (linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC)
+ // fs/open.c:do_dentry_open.
+ fd.statusFlags = flags &^ FileCreationFlags
fd.vd = VirtualDentry{
mount: mnt,
dentry: d,
@@ -157,7 +171,7 @@ func (fd *FileDescription) TryIncRef() bool {
}
// DecRef decrements fd's reference count.
-func (fd *FileDescription) DecRef() {
+func (fd *FileDescription) DecRef(ctx context.Context) {
if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 {
// Unregister fd from all epoll instances.
fd.epollMu.Lock()
@@ -175,12 +189,25 @@ func (fd *FileDescription) DecRef() {
}
ep.interestMu.Unlock()
}
+
+ // If BSD locks were used, release any lock that this FD may have acquired.
+ if atomic.LoadUint32(&fd.usedLockBSD) != 0 {
+ fd.impl.UnlockBSD(context.Background(), fd)
+ }
+
// Release implementation resources.
- fd.impl.Release()
+ fd.impl.Release(ctx)
if fd.writable {
fd.vd.mount.EndWrite()
}
- fd.vd.DecRef()
+ fd.vd.DecRef(ctx)
+ fd.flagsMu.Lock()
+ // TODO(gvisor.dev/issue/1663): We may need to unregister during save, as we do in VFS1.
+ if fd.statusFlags&linux.O_ASYNC != 0 && fd.asyncHandler != nil {
+ fd.asyncHandler.Unregister(fd)
+ }
+ fd.asyncHandler = nil
+ fd.flagsMu.Unlock()
} else if refs < 0 {
panic("FileDescription.DecRef() called without holding a reference")
}
@@ -210,6 +237,11 @@ func (fd *FileDescription) VirtualDentry() VirtualDentry {
return fd.vd
}
+// Options returns the options passed to fd.Init().
+func (fd *FileDescription) Options() FileDescriptionOptions {
+ return fd.opts
+}
+
// StatusFlags returns file description status flags, as for fcntl(F_GETFL).
func (fd *FileDescription) StatusFlags() uint32 {
return atomic.LoadUint32(&fd.statusFlags)
@@ -257,9 +289,20 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede
if flags&linux.O_DIRECT != 0 && !fd.opts.AllowDirectIO {
return syserror.EINVAL
}
- // TODO(jamieliu): FileDescriptionImpl.SetOAsync()?
+ // TODO(gvisor.dev/issue/1035): FileDescriptionImpl.SetOAsync()?
const settableFlags = linux.O_APPEND | linux.O_ASYNC | linux.O_DIRECT | linux.O_NOATIME | linux.O_NONBLOCK
+ fd.flagsMu.Lock()
+ if fd.asyncHandler != nil {
+ // Use fd.statusFlags instead of oldFlags, which may have become outdated,
+ // to avoid double registering/unregistering.
+ if fd.statusFlags&linux.O_ASYNC == 0 && flags&linux.O_ASYNC != 0 {
+ fd.asyncHandler.Register(fd)
+ } else if fd.statusFlags&linux.O_ASYNC != 0 && flags&linux.O_ASYNC == 0 {
+ fd.asyncHandler.Unregister(fd)
+ }
+ }
atomic.StoreUint32(&fd.statusFlags, (oldFlags&^settableFlags)|(flags&settableFlags))
+ fd.flagsMu.Unlock()
return nil
}
@@ -292,7 +335,7 @@ func (fd *FileDescription) Impl() FileDescriptionImpl {
type FileDescriptionImpl interface {
// Release is called when the associated FileDescription reaches zero
// references.
- Release()
+ Release(ctx context.Context)
// OnClose is called when a file descriptor representing the
// FileDescription is closed. Note that returning a non-nil error does not
@@ -311,6 +354,12 @@ type FileDescriptionImpl interface {
// represented by the FileDescription.
StatFS(ctx context.Context) (linux.Statfs, error)
+ // Allocate grows the file to offset + length bytes.
+ // Only mode == 0 is supported currently.
+ //
+ // Preconditions: The FileDescription was opened for writing.
+ Allocate(ctx context.Context, mode, offset, length uint64) error
+
// waiter.Waitable methods may be used to poll for I/O events.
waiter.Waitable
@@ -415,24 +464,16 @@ type FileDescriptionImpl interface {
Removexattr(ctx context.Context, name string) error
// LockBSD tries to acquire a BSD-style advisory file lock.
- //
- // TODO(gvisor.dev/issue/1480): BSD-style file locking
LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error
- // LockBSD releases a BSD-style advisory file lock.
- //
- // TODO(gvisor.dev/issue/1480): BSD-style file locking
+ // UnlockBSD releases a BSD-style advisory file lock.
UnlockBSD(ctx context.Context, uid lock.UniqueID) error
// LockPOSIX tries to acquire a POSIX-style advisory file lock.
- //
- // TODO(gvisor.dev/issue/1480): POSIX-style file locking
- LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error
+ LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, start, length uint64, whence int16, block lock.Blocker) error
// UnlockPOSIX releases a POSIX-style advisory file lock.
- //
- // TODO(gvisor.dev/issue/1480): POSIX-style file locking
- UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error
+ UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, length uint64, whence int16) error
}
// Dirent holds the information contained in struct linux_dirent64.
@@ -462,6 +503,15 @@ type IterDirentsCallback interface {
Handle(dirent Dirent) error
}
+// IterDirentsCallbackFunc implements IterDirentsCallback for a function with
+// the semantics of IterDirentsCallback.Handle.
+type IterDirentsCallbackFunc func(dirent Dirent) error
+
+// Handle implements IterDirentsCallback.Handle.
+func (f IterDirentsCallbackFunc) Handle(dirent Dirent) error {
+ return f(dirent)
+}
+
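
The adapter above follows the http.HandlerFunc pattern. A hedged usage sketch; countDirents is a hypothetical helper showing how a closure can accumulate state without declaring a named callback type:

```go
package example

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// countDirents counts directory entries; IterDirents invokes the callback
// once per dirent.
func countDirents(ctx context.Context, fd *vfs.FileDescription) (int, error) {
	var count int
	cb := vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
		count++
		return nil
	})
	if err := fd.IterDirents(ctx, cb); err != nil {
		return 0, err
	}
	return count, nil
}
```
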
// OnClose is called when a file descriptor representing the FileDescription is
// closed. Returning a non-nil error should not prevent the file descriptor
// from being closed.
@@ -478,7 +528,7 @@ func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.St
Start: fd.vd,
})
stat, err := fd.vd.mount.fs.impl.StatAt(ctx, rp, opts)
- vfsObj.putResolvingPath(rp)
+ vfsObj.putResolvingPath(ctx, rp)
return stat, err
}
return fd.impl.Stat(ctx, opts)
@@ -493,7 +543,7 @@ func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) err
Start: fd.vd,
})
err := fd.vd.mount.fs.impl.SetStatAt(ctx, rp, opts)
- vfsObj.putResolvingPath(rp)
+ vfsObj.putResolvingPath(ctx, rp)
return err
}
return fd.impl.SetStat(ctx, opts)
@@ -509,23 +559,37 @@ func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
Start: fd.vd,
})
statfs, err := fd.vd.mount.fs.impl.StatFSAt(ctx, rp)
- vfsObj.putResolvingPath(rp)
+ vfsObj.putResolvingPath(ctx, rp)
return statfs, err
}
return fd.impl.StatFS(ctx)
}
-// Readiness returns fd's I/O readiness.
+// Allocate grows the file represented by fd to offset + length bytes.
+func (fd *FileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ if !fd.IsWritable() {
+ return syserror.EBADF
+ }
+ return fd.impl.Allocate(ctx, mode, offset, length)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+//
+// It returns fd's I/O readiness.
func (fd *FileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
return fd.impl.Readiness(mask)
}
-// EventRegister registers e for I/O readiness events in mask.
+// EventRegister implements waiter.Waitable.EventRegister.
+//
+// It registers e for I/O readiness events in mask.
func (fd *FileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
fd.impl.EventRegister(e, mask)
}
-// EventUnregister unregisters e for I/O readiness events.
+// EventUnregister implements waiter.Waitable.EventUnregister.
+//
+// It unregisters e for I/O readiness events.
func (fd *FileDescription) EventUnregister(e *waiter.Entry) {
fd.impl.EventUnregister(e)
}
@@ -615,7 +679,7 @@ func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string
Start: fd.vd,
})
names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp, size)
- vfsObj.putResolvingPath(rp)
+ vfsObj.putResolvingPath(ctx, rp)
return names, err
}
names, err := fd.impl.Listxattr(ctx, size)
@@ -644,7 +708,7 @@ func (fd *FileDescription) Getxattr(ctx context.Context, opts *GetxattrOptions)
Start: fd.vd,
})
val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
- vfsObj.putResolvingPath(rp)
+ vfsObj.putResolvingPath(ctx, rp)
return val, err
}
return fd.impl.Getxattr(ctx, *opts)
@@ -660,7 +724,7 @@ func (fd *FileDescription) Setxattr(ctx context.Context, opts *SetxattrOptions)
Start: fd.vd,
})
err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
- vfsObj.putResolvingPath(rp)
+ vfsObj.putResolvingPath(ctx, rp)
return err
}
return fd.impl.Setxattr(ctx, *opts)
@@ -676,7 +740,7 @@ func (fd *FileDescription) Removexattr(ctx context.Context, name string) error {
Start: fd.vd,
})
err := fd.vd.mount.fs.impl.RemovexattrAt(ctx, rp, name)
- vfsObj.putResolvingPath(rp)
+ vfsObj.putResolvingPath(ctx, rp)
return err
}
return fd.impl.Removexattr(ctx, name)
@@ -693,7 +757,7 @@ func (fd *FileDescription) MappedName(ctx context.Context) string {
vfsroot := RootFromContext(ctx)
s, _ := fd.vd.mount.vfs.PathnameWithDeleted(ctx, vfsroot, fd.vd)
if vfsroot.Ok() {
- vfsroot.DecRef()
+ vfsroot.DecRef(ctx)
}
return s
}
@@ -731,3 +795,53 @@ func (fd *FileDescription) InodeID() uint64 {
func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error {
return fd.Sync(ctx)
}
+
+// LockBSD tries to acquire a BSD-style advisory file lock.
+func (fd *FileDescription) LockBSD(ctx context.Context, lockType lock.LockType, blocker lock.Blocker) error {
+ atomic.StoreUint32(&fd.usedLockBSD, 1)
+ return fd.impl.LockBSD(ctx, fd, lockType, blocker)
+}
+
+// UnlockBSD releases a BSD-style advisory file lock.
+func (fd *FileDescription) UnlockBSD(ctx context.Context) error {
+ return fd.impl.UnlockBSD(ctx, fd)
+}
+
+// LockPOSIX locks a POSIX-style file range lock.
+func (fd *FileDescription) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, start, end uint64, whence int16, block lock.Blocker) error {
+ return fd.impl.LockPOSIX(ctx, uid, t, start, end, whence, block)
+}
+
+// UnlockPOSIX unlocks a POSIX-style file range lock.
+func (fd *FileDescription) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, end uint64, whence int16) error {
+ return fd.impl.UnlockPOSIX(ctx, uid, start, end, whence)
+}
+
+// A FileAsync sends signals to its owner when the registered file becomes
+// ready for IO. This is only implemented by pkg/sentry/fasync:FileAsync, but
+// we unfortunately need this interface to avoid circular dependencies.
+type FileAsync interface {
+ Register(w waiter.Waitable)
+ Unregister(w waiter.Waitable)
+}
+
+// AsyncHandler returns the FileAsync for fd.
+func (fd *FileDescription) AsyncHandler() FileAsync {
+ fd.flagsMu.Lock()
+ defer fd.flagsMu.Unlock()
+ return fd.asyncHandler
+}
+
+// SetAsyncHandler sets fd.asyncHandler if it has not been set before and
+// returns it.
+func (fd *FileDescription) SetAsyncHandler(newHandler func() FileAsync) FileAsync {
+ fd.flagsMu.Lock()
+ defer fd.flagsMu.Unlock()
+ if fd.asyncHandler == nil {
+ fd.asyncHandler = newHandler()
+ if fd.statusFlags&linux.O_ASYNC != 0 {
+ fd.asyncHandler.Register(fd)
+ }
+ }
+ return fd.asyncHandler
+}
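
SetAsyncHandler is built for lazy, once-only initialization from the fcntl(F_SETOWN) path. A self-contained sketch of the contract using a toy handler; noopAsync is hypothetical, and the real handler lives in the fasync package:

```go
package example

import (
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/waiter"
)

// noopAsync is a stand-in vfs.FileAsync used only to show the contract.
type noopAsync struct{}

func (noopAsync) Register(w waiter.Waitable)   {}
func (noopAsync) Unregister(w waiter.Waitable) {}

// setOwner sketches fcntl(F_SETOWN): the constructor runs at most once per
// FileDescription, and the new handler is registered immediately if O_ASYNC
// is already set on the file.
func setOwner(fd *vfs.FileDescription) vfs.FileAsync {
	return fd.SetAsyncHandler(func() vfs.FileAsync { return noopAsync{} })
}
```
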
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index f4c111926..6b8b4ad49 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -21,7 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
@@ -56,6 +56,12 @@ func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, err
return linux.Statfs{}, syserror.ENOSYS
}
+// Allocate implements FileDescriptionImpl.Allocate analogously to
+// fallocate(2) called on a regular file, directory, or FIFO in Linux.
+func (FileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ return syserror.ENODEV
+}
+
// Readiness implements waiter.Waitable.Readiness analogously to
// file_operations::poll == NULL in Linux.
func (FileDescriptionDefaultImpl) Readiness(mask waiter.EventMask) waiter.EventMask {
@@ -153,31 +159,16 @@ func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string)
return syserror.ENOTSUP
}
-// LockBSD implements FileDescriptionImpl.LockBSD.
-func (FileDescriptionDefaultImpl) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error {
- return syserror.EBADF
-}
-
-// UnlockBSD implements FileDescriptionImpl.UnlockBSD.
-func (FileDescriptionDefaultImpl) UnlockBSD(ctx context.Context, uid lock.UniqueID) error {
- return syserror.EBADF
-}
-
-// LockPOSIX implements FileDescriptionImpl.LockPOSIX.
-func (FileDescriptionDefaultImpl) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error {
- return syserror.EBADF
-}
-
-// UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX.
-func (FileDescriptionDefaultImpl) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error {
- return syserror.EBADF
-}
-
// DirectoryFileDescriptionDefaultImpl may be embedded by implementations of
// FileDescriptionImpl that always represent directories to obtain
// implementations of non-directory I/O methods that return EISDIR.
type DirectoryFileDescriptionDefaultImpl struct{}
+// Allocate implements FileDescriptionImpl.Allocate.
+func (DirectoryFileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ return syserror.EISDIR
+}
+
// PRead implements FileDescriptionImpl.PRead.
func (DirectoryFileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
return 0, syserror.EISDIR
@@ -347,7 +338,7 @@ func (fd *DynamicBytesFileDescriptionImpl) pwriteLocked(ctx context.Context, src
writable, ok := fd.data.(WritableDynamicBytesSource)
if !ok {
- return 0, syserror.EINVAL
+ return 0, syserror.EIO
}
n, err := writable.Write(ctx, src, offset)
if err != nil {
@@ -384,3 +375,54 @@ func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.M
fd.IncRef()
return nil
}
+
+// LockFD may be used by most implementations of the FileDescriptionImpl.Lock*
+// methods. Callers must call Init() before first use.
+type LockFD struct {
+ locks *FileLocks
+}
+
+// Init initializes fd with FileLocks to use.
+func (fd *LockFD) Init(locks *FileLocks) {
+ fd.locks = locks
+}
+
+// Locks returns the locks associated with this file.
+func (fd *LockFD) Locks() *FileLocks {
+ return fd.locks
+}
+
+// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
+func (fd *LockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
+ return fd.locks.LockBSD(uid, t, block)
+}
+
+// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD.
+func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error {
+ fd.locks.UnlockBSD(uid)
+ return nil
+}
+
+// NoLockFD implements the Lock*/Unlock* portion of the FileDescriptionImpl
+// interface by returning ENOLCK.
+type NoLockFD struct{}
+
+// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
+func (NoLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
+ return syserror.ENOLCK
+}
+
+// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD.
+func (NoLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error {
+ return syserror.ENOLCK
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (NoLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return syserror.ENOLCK
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (NoLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return syserror.ENOLCK
+}
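
A filesystem opts its FDs into advisory locking by embedding LockFD and pointing it at a shared FileLocks, typically stored on the inode so that every open of the file sees one lock set. A hedged sketch; the inode and regularFileFD types are hypothetical:

```go
package example

import "gvisor.dev/gvisor/pkg/sentry/vfs"

// inode and regularFileFD illustrate the embedding pattern only.
type inode struct {
	locks vfs.FileLocks // One lock set shared by all FDs on this file.
}

type regularFileFD struct {
	vfs.FileDescriptionDefaultImpl
	vfs.LockFD
}

func newRegularFileFD(i *inode) *regularFileFD {
	fd := &regularFileFD{}
	fd.LockFD.Init(&i.locks) // LockBSD/UnlockBSD now delegate to i.locks.
	return fd
}
```
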
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index 3a75d4d62..1cd607c0a 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -33,6 +33,7 @@ import (
type fileDescription struct {
vfsfd FileDescription
FileDescriptionDefaultImpl
+ NoLockFD
}
// genCount contains the number of times its DynamicBytesSource.Generate()
@@ -79,9 +80,9 @@ type testFD struct {
data DynamicBytesSource
}
-func newTestFD(vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesSource) *FileDescription {
+func newTestFD(ctx context.Context, vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesSource) *FileDescription {
vd := vfsObj.NewAnonVirtualDentry("genCountFD")
- defer vd.DecRef()
+ defer vd.DecRef(ctx)
var fd testFD
fd.vfsfd.Init(&fd, statusFlags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{})
fd.DynamicBytesFileDescriptionImpl.SetDataSource(data)
@@ -89,7 +90,7 @@ func newTestFD(vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesS
}
// Release implements FileDescriptionImpl.Release.
-func (fd *testFD) Release() {
+func (fd *testFD) Release(context.Context) {
}
// SetStatusFlags implements FileDescriptionImpl.SetStatusFlags.
@@ -108,11 +109,11 @@ func TestGenCountFD(t *testing.T) {
ctx := contexttest.Context(t)
vfsObj := &VirtualFilesystem{}
- if err := vfsObj.Init(); err != nil {
+ if err := vfsObj.Init(ctx); err != nil {
t.Fatalf("VFS init: %v", err)
}
- fd := newTestFD(vfsObj, linux.O_RDWR, &genCount{})
- defer fd.DecRef()
+ fd := newTestFD(ctx, vfsObj, linux.O_RDWR, &genCount{})
+ defer fd.DecRef(ctx)
// The first read causes Generate to be called to fill the FD's buffer.
buf := make([]byte, 2)
@@ -154,11 +155,11 @@ func TestGenCountFD(t *testing.T) {
}
// Write and PWrite fails.
- if _, err := fd.Write(ctx, ioseq, WriteOptions{}); err != syserror.EINVAL {
- t.Errorf("Write: got err %v, wanted %v", err, syserror.EINVAL)
+ if _, err := fd.Write(ctx, ioseq, WriteOptions{}); err != syserror.EIO {
+ t.Errorf("Write: got err %v, wanted %v", err, syserror.EIO)
}
- if _, err := fd.PWrite(ctx, ioseq, 0, WriteOptions{}); err != syserror.EINVAL {
- t.Errorf("Write: got err %v, wanted %v", err, syserror.EINVAL)
+ if _, err := fd.PWrite(ctx, ioseq, 0, WriteOptions{}); err != syserror.EIO {
+ t.Errorf("Write: got err %v, wanted %v", err, syserror.EIO)
}
}
@@ -166,11 +167,11 @@ func TestWritable(t *testing.T) {
ctx := contexttest.Context(t)
vfsObj := &VirtualFilesystem{}
- if err := vfsObj.Init(); err != nil {
+ if err := vfsObj.Init(ctx); err != nil {
t.Fatalf("VFS init: %v", err)
}
- fd := newTestFD(vfsObj, linux.O_RDWR, &storeData{data: "init"})
- defer fd.DecRef()
+ fd := newTestFD(ctx, vfsObj, linux.O_RDWR, &storeData{data: "init"})
+ defer fd.DecRef(ctx)
buf := make([]byte, 10)
ioseq := usermem.BytesIOSequence(buf)
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 1edd584c9..df3758fd1 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -100,12 +100,12 @@ func (fs *Filesystem) TryIncRef() bool {
}
// DecRef decrements fs' reference count.
-func (fs *Filesystem) DecRef() {
+func (fs *Filesystem) DecRef(ctx context.Context) {
if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 {
fs.vfs.filesystemsMu.Lock()
delete(fs.vfs.filesystems, fs)
fs.vfs.filesystemsMu.Unlock()
- fs.impl.Release()
+ fs.impl.Release(ctx)
} else if refs < 0 {
panic("Filesystem.decRef() called without holding a reference")
}
@@ -149,7 +149,7 @@ func (fs *Filesystem) DecRef() {
type FilesystemImpl interface {
// Release is called when the associated Filesystem reaches zero
// references.
- Release()
+ Release(ctx context.Context)
// Sync "causes all pending modifications to filesystem metadata and cached
// file data to be written to the underlying [filesystem]", as by syncfs(2).
@@ -524,8 +524,6 @@ type FilesystemImpl interface {
//
// Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl.
PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error
-
- // TODO(gvisor.dev/issue/1479): inotify_add_watch()
}
// PrependPathAtVFSRootError is returned by implementations of
diff --git a/pkg/sentry/vfs/g3doc/inotify.md b/pkg/sentry/vfs/g3doc/inotify.md
new file mode 100644
index 000000000..e7da49faa
--- /dev/null
+++ b/pkg/sentry/vfs/g3doc/inotify.md
@@ -0,0 +1,210 @@
+# Inotify
+
+Inotify is a mechanism for monitoring filesystem events in Linux; see
+inotify(7). An inotify instance can be used to monitor files and directories for
+modifications, creation/deletion, etc. The inotify API consists of system calls
+that create inotify instances (inotify_init/inotify_init1) and add/remove
+watches on files to an instance (inotify_add_watch/inotify_rm_watch). Events are
+generated from various places in the sentry, including the syscall layer, the
+vfs layer, the process fd table, and within each filesystem implementation. This
+document outlines the implementation details of inotify in VFS2.
+
+## Inotify Objects
+
+Inotify data structures are implemented in the vfs package.
+
+### vfs.Inotify
+
+Inotify instances are represented by vfs.Inotify objects, which implement
+vfs.FileDescriptionImpl. As in Linux, inotify fds are backed by a
+pseudo-filesystem (anonfs). Each inotify instance receives events from a set of
+vfs.Watch objects, which can be modified with inotify_add_watch(2) and
+inotify_rm_watch(2). An application can retrieve events by reading the inotify
+fd.
+
+### vfs.Watches
+
+The set of all watches held on a single file (i.e., the watch target) is stored
+in vfs.Watches. Each watch will belong to a different inotify instance (an
+instance can only have one watch on any watch target). The watches are stored in
+a map indexed by their vfs.Inotify owner’s id. Hard links and file descriptions
+to a single file will all share the same vfs.Watches. Activity on the target
+causes its vfs.Watches to generate notifications on its watches’ inotify
+instances.
+
+### vfs.Watch
+
+A single watch, owned by one inotify instance and applied to one watch target.
+Both the vfs.Inotify owner and vfs.Watches on the target will hold a vfs.Watch,
+which leads to some complicated locking behavior (see Lock Ordering). Whenever a
+watch is notified of an event on its target, it will queue events to its inotify
+instance for delivery to the user.
+
+### vfs.Event
+
+vfs.Event is a simple struct encapsulating all the fields for an inotify event.
+It is generated by vfs.Watches and forwarded to the watches' owners. It is
+serialized to the user during read(2) syscalls on the associated vfs.Inotify's
+fd.
+
+## Lock Ordering
+
+There are three locks related to the inotify implementation:
+
+* Inotify.mu: the inotify instance lock.
+* Inotify.evMu: the inotify event queue lock.
+* Watches.mu: the watch set lock, used to protect the collection of watches
+  on a target.
+
+The correct lock ordering for inotify code is:
+
+Inotify.mu -> Watches.mu -> Inotify.evMu.
+
+Note that we use a distinct lock to protect the inotify event queue. If we
+simply used Inotify.mu, we could simultaneously have locks being acquired in the
+order of Inotify.mu -> Watches.mu and Watches.mu -> Inotify.mu, which would
+cause deadlocks. For instance, adding a watch to an inotify instance would
+require locking Inotify.mu, and then adding the same watch to the target would
+cause Watches.mu to be held. At the same time, generating an event on the target
+would require Watches.mu to be held before iterating through each watch, and
+then notifying the owner of each watch would cause Inotify.mu to be held.
+
+See the vfs package comment to understand how inotify locks fit into the overall
+ordering of filesystem locks.
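
To make the ordering concrete, here is a hedged fragment mirroring the code in this change: adding a watch nests Watches.mu inside Inotify.mu, while notification holds Watches.mu and then takes only Inotify.evMu, never Inotify.mu.

```go
// Adding a watch: Inotify.mu -> Watches.mu.
i.mu.Lock()
ws.Add(w) // Acquires Watches.mu internally.
i.mu.Unlock()

// Notifying watches: Watches.mu -> Inotify.evMu. Inotify.mu is never
// taken while Watches.mu is held.
ws.mu.RLock()
for _, watch := range ws.ws {
	watch.owner.queueEvent(ev) // Acquires only Inotify.evMu.
}
ws.mu.RUnlock()
```
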
+
+## Watch Targets in Different Filesystem Implementations
+
+In Linux, watches reside on inodes at the virtual filesystem layer. As a result,
+all hard links and file descriptions on a single file share the same
+watch set. In VFS2, there is no common inode structure across filesystem types
+(some may not even have inodes), so we have to plumb inotify support through
+each specific filesystem implementation. Some of the technical considerations
+are outlined below.
+
+### Tmpfs
+
+For filesystems with inodes, like tmpfs, the design is quite similar to that of
+Linux, where watches reside on the inode.
+
+### Pseudo-filesystems
+
+Technically, because inotify is implemented at the vfs layer in Linux,
+pseudo-filesystems on top of kernfs support inotify passively. However, watches
+can only track explicit filesystem operations like read/write, open/close,
+mknod, etc., so watches on a target like /proc/self/fd will not generate events
+every time a new fd is added or removed. As of this writing, we leave inotify
+unimplemented in kernfs and anonfs; it does not seem particularly useful.
+
+### Gofer Filesystem (fsimpl/gofer)
+
+The gofer filesystem has several traits that make it difficult to support
+inotify:
+
+* **There are no inodes.** A file is represented as a dentry that holds an
+ unopened p9 file (and possibly an open FID), through which the Sentry
+ interacts with the gofer.
+ * *Solution:* Because there is no inode structure stored in the sandbox,
+ inotify watches must be held on the dentry. This would be an issue in
+ the presence of hard links, where multiple dentries would need to share
+ the same set of watches, but in VFS2, we do not support the internal
+ creation of hard links on gofer fs. As a result, we make the assumption
+ that every dentry corresponds to a unique inode. However, the next point
+ raises an issue with this assumption:
+* **The Sentry cannot always be aware of hard links on the remote
+ filesystem.** There is no way for us to confirm whether two files on the
+ remote filesystem are actually links to the same inode. QIDs and inodes are
+ not always 1:1. The assumption that dentries and inodes are 1:1 is
+ inevitably broken if there are remote hard links that we cannot detect.
+ * *Solution:* this is an issue with gofer fs in general, not only inotify,
+ and we will have to live with it.
+* **Dentries can be cached, and then evicted.** Dentry lifetime does not
+ correspond to file lifetime. Because gofer fs is not entirely in-memory, the
+ absence of a dentry does not mean that the corresponding file does not
+ exist, nor does a dentry reaching zero references mean that the
+ corresponding file no longer exists. When a dentry reaches zero references,
+ it will be cached, in case the file at that path is needed again in the
+ future. However, the dentry may be evicted from the cache, which will cause
+ a new dentry to be created next time the same file path is used. The
+ existing watches will be lost.
+ * *Solution:* When a dentry reaches zero references, do not cache it if it
+ has any watches, so we can avoid eviction/destruction. Note that if the
+ dentry was deleted or invalidated (d.vfsd.IsDead()), we should still
+ destroy it along with its watches. Additionally, when a dentry’s last
+ watch is removed, we cache it if it also has zero references. This way,
+ the dentry can eventually be evicted from memory if it is no longer
+ needed.
+* **Dentries can be invalidated.** Another issue with dentry lifetime is that
+ the remote file at the file path represented may change from underneath the
+ dentry. In this case, the next time that the dentry is used, it will be
+ invalidated and a new dentry will replace it. In this case, it is not clear
+ what should be done with the watches on the old dentry.
+ * *Solution:* Silently destroy the watches when invalidation occurs. We
+ have no way of knowing exactly what happened or when it happened. Inotify
+ instances on NFS files in Linux probably behave in a similar fashion,
+ since inotify is implemented at the vfs layer and is not aware of the
+ complexities of remote file systems.
+ * An alternative would be to issue some kind of event upon invalidation,
+ e.g. a delete event, but this has several issues:
+ * We cannot discern whether the remote file was invalidated because it was
+ moved, deleted, etc. This information is crucial, because these cases
+ should result in different events. Furthermore, the watches should only
+ be destroyed if the file has been deleted.
+ * Moreover, the mechanism for detecting whether the underlying file has
+ changed is to check whether a new QID is given by the gofer. This may
+ result in false positives, e.g. suppose that the server closed and
+ re-opened the same file, which may result in a new QID.
+ * Finally, the time of the event may be completely different from the time
+ of the file modification, since a dentry is not immediately notified
+ when the underlying file has changed. It would be quite unexpected to
+ receive the notification when invalidation was triggered, i.e. the next
+ time the file was accessed within the sandbox, because then the
+ read/write/etc. operation on the file would not result in the expected
+ event.
+ * Another point in favor of the first solution: inotify in Linux can
+ already be lossy on local filesystems (one of the sacrifices made so
+ that filesystem performance isn’t killed), and it is lossy on NFS for
+ similar reasons to gofer fs. Therefore, it is better for inotify to be
+ silent than to emit incorrect notifications.
+* **There may be external users of the remote filesystem.** We can only track
+ operations performed on the file within the sandbox. This is sufficient
+ under InteropModeExclusive, but whenever there are external users, the set
+ of actions we are aware of is incomplete.
+ * *Solution:* We could either return an error or just issue a warning when
+ inotify is used without InteropModeExclusive. Although faulty, VFS1
+ allows it when the filesystem is shared, and Linux does the same for
+ remote filesystems (as mentioned above, inotify sits at the vfs level).
+
+## Dentry Interface
+
+For events that must be generated above the vfs layer, we provide the following
+DentryImpl methods to allow interactions with targets on any FilesystemImpl:
+
+* **InotifyWithParent()** generates events on the dentry’s watches as well as
+ its parent’s.
+* **Watches()** retrieves the watch set of the target represented by the
+ dentry. This is used to access and modify watches on a target.
+* **OnZeroWatches()** performs cleanup tasks after the last watch is removed
+ from a dentry. This is needed by gofer fs, which must allow a watched dentry
+ to be cached once it has no more watches. Most implementations can just do
+ nothing. Note that OnZeroWatches() must be called after all inotify locks
+ are released to preserve lock ordering, since it may acquire
+ FilesystemImpl-specific locks.
+
+## IN_EXCL_UNLINK
+
+There are several options that can be set for a watch, specified as part of the
+mask in inotify_add_watch(2). In particular, IN_EXCL_UNLINK requires some
+additional support in each filesystem.
+
+A watch with IN_EXCL_UNLINK will not generate events for its target if it
+corresponds to a path that was unlinked. For instance, if an fd is opened on
+“foo/bar” and “foo/bar” is subsequently unlinked, any reads/writes/etc. on the
+fd will be ignored by watches on “foo” or “foo/bar” with IN_EXCL_UNLINK. This
+requires each DentryImpl to keep track of whether it has been unlinked, in order
+to determine whether events should be sent to watches with IN_EXCL_UNLINK.
+
+## IN_ONESHOT
+
+One-shot watches expire after generating a single event. When an event occurs,
+all one-shot watches on the target that successfully generated an event are
+removed. Lock ordering can cause the management of one-shot watches to be quite
+expensive; see Watches.Notify() for more information.
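
The one-shot flow maps onto the implementation below: Watch.Notify atomically marks the watch expired after queuing its event, and Watches.Notify sweeps expired watches once it has dropped its read lock. A hedged fragment of the marking step:

```go
// In Watch.Notify, after the event is queued (fragment for illustration):
if mask&linux.IN_ONESHOT != 0 {
	atomic.StoreInt32(&w.expired, 1)
	return true // Tells the caller to sweep expired watches.
}
```
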
diff --git a/pkg/sentry/vfs/genericfstree/genericfstree.go b/pkg/sentry/vfs/genericfstree/genericfstree.go
index 286510195..8882fa84a 100644
--- a/pkg/sentry/vfs/genericfstree/genericfstree.go
+++ b/pkg/sentry/vfs/genericfstree/genericfstree.go
@@ -43,7 +43,7 @@ type Dentry struct {
// IsAncestorDentry returns true if d is an ancestor of d2; that is, d is
// either d2's parent or an ancestor of d2's parent.
func IsAncestorDentry(d, d2 *Dentry) bool {
- for {
+ for d2 != nil { // Stop at root, where d2.parent == nil.
if d2.parent == d {
return true
}
@@ -52,6 +52,7 @@ func IsAncestorDentry(d, d2 *Dentry) bool {
}
d2 = d2.parent
}
+ return false
}
// ParentOrSelf returns d.parent. If d.parent is nil, ParentOrSelf returns d.
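
The added nil check terminates the walk at the root. A hedged in-package regression-test sketch for the case that previously dereferenced a nil parent; field names follow the Dentry struct above:

```go
package genericfstree

import "testing"

func TestIsAncestorDentryStopsAtRoot(t *testing.T) {
	root := &Dentry{}
	child := &Dentry{parent: root}
	if !IsAncestorDentry(root, child) {
		t.Errorf("IsAncestorDentry(root, child) = false, want true")
	}
	// Before the fix, this walked past the root, set d2 to nil, and
	// panicked on the next d2.parent dereference; it must return false.
	if IsAncestorDentry(child, root) {
		t.Errorf("IsAncestorDentry(child, root) = true, want false")
	}
}
```
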
diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go
new file mode 100644
index 000000000..aff220a61
--- /dev/null
+++ b/pkg/sentry/vfs/inotify.go
@@ -0,0 +1,774 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "bytes"
+ "fmt"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/uniqueid"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// inotifyEventBaseSize is the base size of linux's struct inotify_event. This
+// must be a power of 2 for rounding below.
+const inotifyEventBaseSize = 16
+
+// EventType defines different kinds of inotify events.
+//
+// The way events are labelled appears somewhat arbitrary, but they must match
+// Linux so that IN_EXCL_UNLINK behaves as it does in Linux.
+type EventType uint8
+
+// PathEvent and InodeEvent correspond to FSNOTIFY_EVENT_PATH and
+// FSNOTIFY_EVENT_INODE in Linux.
+const (
+ PathEvent EventType = iota
+ InodeEvent EventType = iota
+)
+
+// Inotify represents an inotify instance created by inotify_init(2) or
+// inotify_init1(2). Inotify implements FileDescriptionImpl.
+//
+// +stateify savable
+type Inotify struct {
+ vfsfd FileDescription
+ FileDescriptionDefaultImpl
+ DentryMetadataFileDescriptionImpl
+ NoLockFD
+
+ // Unique identifier for this inotify instance. We don't just reuse the
+ // inotify fd because fds can be duped. These should not be exposed to the
+ // user, since we may aggressively reuse an id on S/R.
+ id uint64
+
+ // queue is used to notify interested parties when the inotify instance
+ // becomes readable or writable.
+ queue waiter.Queue `state:"nosave"`
+
+ // evMu *only* protects the events list. We need a separate lock while
+ // queuing events: using mu may violate lock ordering, since at that point
+ // the calling goroutine may already hold Watches.mu.
+ evMu sync.Mutex `state:"nosave"`
+
+ // A list of pending events for this inotify instance. Protected by evMu.
+ events eventList
+
+ // A scratch buffer, used to serialize inotify events. Allocate this
+ // ahead of time for the sake of performance. Protected by evMu.
+ scratch []byte
+
+ // mu protects the fields below.
+ mu sync.Mutex `state:"nosave"`
+
+ // nextWatchMinusOne is used to allocate watch descriptors on this Inotify
+ // instance. Note that Linux starts numbering watch descriptors from 1.
+ nextWatchMinusOne int32
+
+ // Map from watch descriptors to watch objects.
+ watches map[int32]*Watch
+}
+
+var _ FileDescriptionImpl = (*Inotify)(nil)
+
+// NewInotifyFD constructs a new Inotify instance.
+func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) (*FileDescription, error) {
+ // O_CLOEXEC affects file descriptors, so it must be handled outside of vfs.
+ flags &^= linux.O_CLOEXEC
+ if flags&^linux.O_NONBLOCK != 0 {
+ return nil, syserror.EINVAL
+ }
+
+ id := uniqueid.GlobalFromContext(ctx)
+ vd := vfsObj.NewAnonVirtualDentry(fmt.Sprintf("[inotifyfd:%d]", id))
+ defer vd.DecRef(ctx)
+ fd := &Inotify{
+ id: id,
+ scratch: make([]byte, inotifyEventBaseSize),
+ watches: make(map[int32]*Watch),
+ }
+ if err := fd.vfsfd.Init(fd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
+ UseDentryMetadata: true,
+ DenyPRead: true,
+ DenyPWrite: true,
+ }); err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
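
A hedged sketch of the syscall-layer caller (cf. pkg/sentry/syscalls/linux/vfs2); the task-side helpers Kernel().VFS() and NewFDFromVFS2 are assumptions drawn from the surrounding tree, not defined in this file. NewInotifyFD strips O_CLOEXEC itself, since close-on-exec belongs to the descriptor rather than the description:

```go
package example

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

func inotifyInit1(t *kernel.Task, flags int32) (uintptr, error) {
	ino, err := vfs.NewInotifyFD(t, t.Kernel().VFS(), uint32(flags))
	if err != nil {
		return 0, err
	}
	defer ino.DecRef(t)

	// Install the file into the task's descriptor table, applying
	// close-on-exec at the descriptor level.
	fd, err := t.NewFDFromVFS2(0, ino, kernel.FDFlags{
		CloseOnExec: flags&linux.IN_CLOEXEC != 0,
	})
	if err != nil {
		return 0, err
	}
	return uintptr(fd), nil
}
```
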
+
+// Release implements FileDescriptionImpl.Release. Release removes all
+// watches and frees all resources for an inotify instance.
+func (i *Inotify) Release(ctx context.Context) {
+ var ds []*Dentry
+
+ // We need to hold i.mu to avoid a race with concurrent calls to
+ // Inotify.handleDeletion from Watches. There's no risk of Watches
+ // accessing this Inotify after the destructor ends, because we remove all
+ // references to it below.
+ i.mu.Lock()
+ for _, w := range i.watches {
+ // Remove references to the watch from the watches set on the target. We
+ // don't need to worry about the references from i.watches, since this
+ // file description is about to be destroyed.
+ d := w.target
+ ws := d.Watches()
+ // Watchable dentries should never return a nil watch set.
+ if ws == nil {
+ panic("Cannot remove watch from an unwatchable dentry")
+ }
+ ws.Remove(i.id)
+ if ws.Size() == 0 {
+ ds = append(ds, d)
+ }
+ }
+ i.mu.Unlock()
+
+ for _, d := range ds {
+ d.OnZeroWatches(ctx)
+ }
+}
+
+// Allocate implements FileDescriptionImpl.Allocate.
+func (i *Inotify) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ panic("Allocate should not be called on read-only inotify fds")
+}
+
+// EventRegister implements waiter.Waitable.
+func (i *Inotify) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ i.queue.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.
+func (i *Inotify) EventUnregister(e *waiter.Entry) {
+ i.queue.EventUnregister(e)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+//
+// Readiness indicates whether there are pending events for an inotify instance.
+func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask {
+ ready := waiter.EventMask(0)
+
+ i.evMu.Lock()
+ defer i.evMu.Unlock()
+
+ if !i.events.Empty() {
+ ready |= waiter.EventIn
+ }
+
+ return mask & ready
+}
+
+// PRead implements FileDescriptionImpl.PRead.
+func (*Inotify) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+ return 0, syserror.ESPIPE
+}
+
+// PWrite implements FileDescriptionImpl.PWrite.
+func (*Inotify) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+ return 0, syserror.ESPIPE
+}
+
+// Write implements FileDescriptionImpl.Write.
+func (*Inotify) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+ return 0, syserror.EBADF
+}
+
+// Read implements FileDescriptionImpl.Read.
+func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+ if dst.NumBytes() < inotifyEventBaseSize {
+ return 0, syserror.EINVAL
+ }
+
+ i.evMu.Lock()
+ defer i.evMu.Unlock()
+
+ if i.events.Empty() {
+ // Nothing to read yet, tell caller to block.
+ return 0, syserror.ErrWouldBlock
+ }
+
+ var writeLen int64
+ for it := i.events.Front(); it != nil; {
+ // Advance `it` before the element is removed from the list, or else
+ // it.Next() will always be nil.
+ event := it
+ it = it.Next()
+
+ // Does the buffer have enough remaining space to hold the event we're
+ // about to write out?
+ if dst.NumBytes() < int64(event.sizeOf()) {
+ if writeLen > 0 {
+ // Buffer wasn't big enough for all pending events, but we did
+ // write some events out.
+ return writeLen, nil
+ }
+ return 0, syserror.EINVAL
+ }
+
+ // Linux always dequeues an available event as long as there's enough
+ // buffer space to copy it out, even if the copy below fails. Emulate
+ // this behaviour.
+ i.events.Remove(event)
+
+ // Buffer has enough space, copy event to the read buffer.
+ n, err := event.CopyTo(ctx, i.scratch, dst)
+ if err != nil {
+ return 0, err
+ }
+
+ writeLen += n
+ dst = dst.DropFirst64(n)
+ }
+ return writeLen, nil
+}
+
+// Ioctl implements FileDescriptionImpl.Ioctl.
+func (i *Inotify) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ switch args[1].Int() {
+ case linux.FIONREAD:
+ i.evMu.Lock()
+ defer i.evMu.Unlock()
+ var n uint32
+ for e := i.events.Front(); e != nil; e = e.Next() {
+ n += uint32(e.sizeOf())
+ }
+ var buf [4]byte
+ usermem.ByteOrder.PutUint32(buf[:], n)
+ _, err := uio.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{})
+ return 0, err
+
+ default:
+ return 0, syserror.ENOTTY
+ }
+}
+
+func (i *Inotify) queueEvent(ev *Event) {
+ i.evMu.Lock()
+
+ // Check if we should coalesce the event we're about to queue with the last
+ // one currently in the queue. Events are coalesced if they are identical.
+ if last := i.events.Back(); last != nil {
+ if ev.equals(last) {
+ // "Coalesce" the two events by simply not queuing the new one. We
+ // don't need to raise a waiter.EventIn notification because no new
+ // data is available for reading.
+ i.evMu.Unlock()
+ return
+ }
+ }
+
+ i.events.PushBack(ev)
+
+ // Release mutex before notifying waiters because we don't control what they
+ // can do.
+ i.evMu.Unlock()
+
+ i.queue.Notify(waiter.EventIn)
+}
+
+// newWatchLocked creates and adds a new watch to target.
+//
+// Precondition: i.mu must be locked. ws must be the watch set for target d.
+func (i *Inotify) newWatchLocked(d *Dentry, ws *Watches, mask uint32) *Watch {
+ w := &Watch{
+ owner: i,
+ wd: i.nextWatchIDLocked(),
+ target: d,
+ mask: mask,
+ }
+
+ // Hold the watch in this inotify instance as well as the watch set on the
+ // target.
+ i.watches[w.wd] = w
+ ws.Add(w)
+ return w
+}
+
+// nextWatchIDLocked allocates and returns a new watch descriptor.
+//
+// Precondition: i.mu must be locked.
+func (i *Inotify) nextWatchIDLocked() int32 {
+ i.nextWatchMinusOne++
+ return i.nextWatchMinusOne
+}
+
+// AddWatch constructs a new inotify watch and adds it to the target. It
+// returns the watch descriptor returned by inotify_add_watch(2).
+//
+// The caller must hold a reference on target.
+func (i *Inotify) AddWatch(target *Dentry, mask uint32) (int32, error) {
+ // Note: Locking this inotify instance protects the result returned by
+ // Lookup() below. With the lock held, we know for sure the lookup result
+ // won't become stale because it's impossible for *this* instance to
+ // add/remove watches on target.
+ i.mu.Lock()
+ defer i.mu.Unlock()
+
+ ws := target.Watches()
+ if ws == nil {
+ // While Linux supports inotify watches on all filesystem types, watches on
+ // filesystems like kernfs are not generally useful, so we do not support them.
+ return 0, syserror.EPERM
+ }
+ // Does the target already have a watch from this inotify instance?
+ if existing := ws.Lookup(i.id); existing != nil {
+ newmask := mask
+ if mask&linux.IN_MASK_ADD != 0 {
+ // "Add (OR) events to watch mask for this pathname if it already
+ // exists (instead of replacing mask)." -- inotify(7)
+ newmask |= atomic.LoadUint32(&existing.mask)
+ }
+ atomic.StoreUint32(&existing.mask, newmask)
+ return existing.wd, nil
+ }
+
+ // No existing watch, create a new watch.
+ w := i.newWatchLocked(target, ws, mask)
+ return w.wd, nil
+}
+
+// RmWatch looks up an inotify watch for the given 'wd' and configures the
+// target to stop sending events to this inotify instance.
+func (i *Inotify) RmWatch(ctx context.Context, wd int32) error {
+ i.mu.Lock()
+
+ // Find the watch we were asked to remove.
+ w, ok := i.watches[wd]
+ if !ok {
+ i.mu.Unlock()
+ return syserror.EINVAL
+ }
+
+ // Remove the watch from this instance.
+ delete(i.watches, wd)
+
+ // Remove the watch from the watch target.
+ ws := w.target.Watches()
+ // AddWatch ensures that w.target has a non-nil watch set.
+ if ws == nil {
+ panic("Watched dentry cannot have nil watch set")
+ }
+ ws.Remove(w.OwnerID())
+ remaining := ws.Size()
+ i.mu.Unlock()
+
+ if remaining == 0 {
+ w.target.OnZeroWatches(ctx)
+ }
+
+ // Generate the event for the removal.
+ i.queueEvent(newEvent(wd, "", linux.IN_IGNORED, 0))
+
+ return nil
+}
+
+// Watches is the collection of all inotify watches on a single file.
+//
+// +stateify savable
+type Watches struct {
+ // mu protects the fields below.
+ mu sync.RWMutex `state:"nosave"`
+
+ // ws is the map of active watches in this collection, keyed by the inotify
+ // instance id of the owner.
+ ws map[uint64]*Watch
+}
+
+// Size returns the number of watches held by w.
+func (w *Watches) Size() int {
+ w.mu.Lock()
+ defer w.mu.Unlock()
+ return len(w.ws)
+}
+
+// Lookup returns the watch owned by an inotify instance with the given id.
+// Returns nil if no such watch exists.
+//
+// Precondition: the inotify instance with the given id must be locked to
+// prevent the returned watch from being concurrently modified or replaced in
+// Inotify.watches.
+func (w *Watches) Lookup(id uint64) *Watch {
+ w.mu.Lock()
+ defer w.mu.Unlock()
+ return w.ws[id]
+}
+
+// Add adds the given watch to this set of watches.
+//
+// Precondition: the inotify instance with the given id must be locked.
+func (w *Watches) Add(watch *Watch) {
+ w.mu.Lock()
+ defer w.mu.Unlock()
+
+ owner := watch.OwnerID()
+ // Sanity check: we should never have two watches for one owner on the
+ // same target.
+ if _, exists := w.ws[owner]; exists {
+ panic(fmt.Sprintf("Watch collision with ID %+v", owner))
+ }
+ if w.ws == nil {
+ w.ws = make(map[uint64]*Watch)
+ }
+ w.ws[owner] = watch
+}
+
+// Remove removes a watch with the given id from this set of watches and
+// releases it. The caller is responsible for generating any watch removal
+// event, as appropriate. The provided id must match an existing watch in this
+// collection.
+//
+// Precondition: the inotify instance with the given id must be locked.
+func (w *Watches) Remove(id uint64) {
+ w.mu.Lock()
+ defer w.mu.Unlock()
+
+ if w.ws == nil {
+ // This watch set is being destroyed. The thread executing the
+ // destructor is already in the process of deleting all our watches. We
+ // got here with no references on the target because we raced with the
+ // destructor notifying all the watch owners of destruction. See the
+ // comment in Watches.HandleDeletion for why this race exists.
+ return
+ }
+
+ // It is possible for w.Remove() to be called for the same watch multiple
+ // times. See the treatment of one-shot watches in Watches.Notify().
+ if _, ok := w.ws[id]; ok {
+ delete(w.ws, id)
+ }
+}
+
+// Notify queues a new event with watches in this set. Watches with
+// IN_EXCL_UNLINK are skipped if the event is coming from a child that has been
+// unlinked.
+func (w *Watches) Notify(ctx context.Context, name string, events, cookie uint32, et EventType, unlinked bool) {
+ var hasExpired bool
+ w.mu.RLock()
+ for _, watch := range w.ws {
+ if unlinked && watch.ExcludeUnlinked() && et == PathEvent {
+ continue
+ }
+ if watch.Notify(name, events, cookie) {
+ hasExpired = true
+ }
+ }
+ w.mu.RUnlock()
+
+ if hasExpired {
+ w.cleanupExpiredWatches(ctx)
+ }
+}
+
+// cleanupExpiredWatches is relatively expensive and should only be called
+// when there are expired watches.
+func (w *Watches) cleanupExpiredWatches(ctx context.Context) {
+ // Because of lock ordering, we cannot acquire Inotify.mu for each watch
+ // owner while holding w.mu. As a result, store expired watches locally
+ // before removing.
+ var toRemove []*Watch
+ w.mu.RLock()
+ for _, watch := range w.ws {
+ if atomic.LoadInt32(&watch.expired) == 1 {
+ toRemove = append(toRemove, watch)
+ }
+ }
+ w.mu.RUnlock()
+ for _, watch := range toRemove {
+ watch.owner.RmWatch(ctx, watch.wd)
+ }
+}
+
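The snapshot-then-act shape of cleanupExpiredWatches is a general way to respect lock ordering; a minimal standalone sketch of the same pattern (all names illustrative):

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

type item struct{ expired int32 }

type set struct {
	mu    sync.RWMutex
	items []*item
}

// reap snapshots expired items under the read lock, then calls remove
// outside the critical section, where remove may take other locks (e.g.
// each owner's mutex) without risking lock-order inversion.
func (s *set) reap(remove func(*item)) {
	var toRemove []*item
	s.mu.RLock()
	for _, it := range s.items {
		if atomic.LoadInt32(&it.expired) == 1 {
			toRemove = append(toRemove, it)
		}
	}
	s.mu.RUnlock()
	for _, it := range toRemove {
		remove(it)
	}
}

func main() {
	s := &set{items: []*item{{expired: 1}, {}}}
	s.reap(func(*item) { fmt.Println("removed one expired item") })
}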
+// HandleDeletion is called when the watch target is destroyed. Clear the
+// watch set, detach watches from the inotify instances they belong to, and
+// generate the appropriate events.
+func (w *Watches) HandleDeletion(ctx context.Context) {
+ w.Notify(ctx, "", linux.IN_DELETE_SELF, 0, InodeEvent, true /* unlinked */)
+
+ // As in Watches.Notify, we can't hold w.mu while acquiring Inotify.mu for
+ // the owner of each watch being deleted. Instead, atomically store the
+ // watches map in a local variable and set it to nil so we can iterate over
+ // it with the assurance that there will be no concurrent accesses.
+ var ws map[uint64]*Watch
+ w.mu.Lock()
+ ws = w.ws
+ w.ws = nil
+ w.mu.Unlock()
+
+ // Remove each watch from its owner's watch set, and generate a corresponding
+ // watch removal event.
+ for _, watch := range ws {
+ i := watch.owner
+ i.mu.Lock()
+ _, found := i.watches[watch.wd]
+ delete(i.watches, watch.wd)
+
+ // Release mutex before notifying waiters because we don't control what
+ // they can do.
+ i.mu.Unlock()
+
+ // If watch was not found, it was removed from the inotify instance before
+ // we could get to it, in which case we should not generate an event.
+ if found {
+ i.queueEvent(newEvent(watch.wd, "", linux.IN_IGNORED, 0))
+ }
+ }
+}
+
+// Watch represents a particular inotify watch created by inotify_add_watch.
+//
+// +stateify savable
+type Watch struct {
+ // Inotify instance which owns this watch.
+ //
+ // This field is immutable after creation.
+ owner *Inotify
+
+ // Descriptor for this watch. This is unique within an inotify instance.
+ //
+ // This field is immutable after creation.
+ wd int32
+
+ // target is a dentry representing the watch target. Its watch set contains this watch.
+ //
+ // This field is immutable after creation.
+ target *Dentry
+
+ // Events being monitored via this watch. Must be accessed with atomic
+ // memory operations.
+ mask uint32
+
+ // expired is set to 1 to indicate that this watch is a one-shot that has
+ // already sent a notification and therefore can be removed. Must be accessed
+ // with atomic memory operations.
+ expired int32
+}
+
+// OwnerID returns the id of the inotify instance that owns this watch.
+func (w *Watch) OwnerID() uint64 {
+ return w.owner.id
+}
+
+// ExcludeUnlinked returns true if the watch holder does not want to be
+// notified of events originating from a path that has been unlinked, i.e.
+// IN_EXCL_UNLINK is set.
+//
+// For example, if "foo/bar" is opened and then unlinked, operations on the
+// open fd may be ignored by watches on "foo" and "foo/bar" with IN_EXCL_UNLINK.
+func (w *Watch) ExcludeUnlinked() bool {
+ return atomic.LoadUint32(&w.mask)&linux.IN_EXCL_UNLINK != 0
+}
+
+// Notify queues a new event on this watch. It returns true if this is a
+// one-shot watch that should be deleted now that the event has been queued.
+func (w *Watch) Notify(name string, events uint32, cookie uint32) bool {
+ if atomic.LoadInt32(&w.expired) == 1 {
+ // This is a one-shot watch that is already in the process of being
+ // removed. This may happen if a second event reaches the watch target
+ // before this watch has been removed.
+ return false
+ }
+
+ mask := atomic.LoadUint32(&w.mask)
+ if mask&events == 0 {
+ // We weren't watching for this event.
+ return false
+ }
+
+ // Event mask should include bits matched from the watch plus all control
+ // event bits.
+ unmaskableBits := ^uint32(0) &^ linux.IN_ALL_EVENTS
+ effectiveMask := unmaskableBits | mask
+ matchedEvents := effectiveMask & events
+ w.owner.queueEvent(newEvent(w.wd, name, matchedEvents, cookie))
+ if mask&linux.IN_ONESHOT != 0 {
+ atomic.StoreInt32(&w.expired, 1)
+ return true
+ }
+ return false
+}
+
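A standalone check of the effective-mask arithmetic above: control bits outside IN_ALL_EVENTS (e.g. IN_ISDIR, IN_IGNORED) always survive the mask, so a watch registered only for IN_MODIFY still sees the IN_ISDIR flag on a directory event. Constants mirror include/uapi/linux/inotify.h:

package main

import "fmt"

const (
	IN_MODIFY     = 0x00000002
	IN_ISDIR      = 0x40000000
	IN_ALL_EVENTS = 0x00000fff
)

func main() {
	mask := uint32(IN_MODIFY) // watch registered for IN_MODIFY only

	// Bits outside IN_ALL_EVENTS can never be masked out.
	unmaskableBits := ^uint32(0) &^ uint32(IN_ALL_EVENTS)
	effectiveMask := unmaskableBits | mask

	events := uint32(IN_MODIFY | IN_ISDIR)             // a directory was modified
	fmt.Printf("matched: %#x\n", effectiveMask&events) // 0x40000002
}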
+// Event represents a struct inotify_event from linux.
+//
+// +stateify savable
+type Event struct {
+ eventEntry
+
+ wd int32
+ mask uint32
+ cookie uint32
+
+ // len is computed from the name field and is set automatically by
+ // Event.setName. It should be 0 when no name is set; otherwise it is the
+ // length of the name slice.
+ len uint32
+
+ // The name field has special padding requirements and should only be set by
+ // calling Event.setName.
+ name []byte
+}
+
+func newEvent(wd int32, name string, events, cookie uint32) *Event {
+ e := &Event{
+ wd: wd,
+ mask: events,
+ cookie: cookie,
+ }
+ if name != "" {
+ e.setName(name)
+ }
+ return e
+}
+
+// paddedBytes converts a Go string to a null-terminated C string, padded with
+// null bytes to a total size of 'l'. 'l' must be large enough for all the
+// bytes in 's' plus at least one null byte.
+func paddedBytes(s string, l uint32) []byte {
+ if l < uint32(len(s)+1) {
+ panic("Converting string to byte array results in truncation, this can lead to buffer-overflow due to the missing null-byte!")
+ }
+ b := make([]byte, l)
+ copy(b, s)
+
+ // b was zero-value initialized during make(), so the rest of the slice is
+ // already filled with null bytes.
+
+ return b
+}
+
+// setName sets the optional name for this event.
+func (e *Event) setName(name string) {
+ // We need to pad the name such that the entire event length ends up a
+ // multiple of inotifyEventBaseSize.
+ unpaddedLen := len(name) + 1
+ // Round up to nearest multiple of inotifyEventBaseSize.
+ e.len = uint32((unpaddedLen + inotifyEventBaseSize - 1) & ^(inotifyEventBaseSize - 1))
+ // Make sure we haven't overflowed and wrapped around when rounding.
+ if unpaddedLen > int(e.len) {
+ panic("Overflow when rounding inotify event size, the 'name' field was too big.")
+ }
+ e.name = paddedBytes(name, e.len)
+}
+
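A quick standalone check of the padding rule, assuming inotifyEventBaseSize is 16 (the size of the struct inotify_event header):

package main

import "fmt"

// inotifyEventBaseSize is the 16-byte struct inotify_event header
// (wd, mask, cookie, len), matching the sentry constant.
const inotifyEventBaseSize = 16

// roundedNameLen reproduces setName's rule: name plus its null
// terminator, rounded up to the next multiple of the header size.
func roundedNameLen(name string) uint32 {
	unpaddedLen := len(name) + 1
	return uint32((unpaddedLen + inotifyEventBaseSize - 1) &^ (inotifyEventBaseSize - 1))
}

func main() {
	for _, n := range []string{"a", "file.txt", "sixteen-chars-xx"} {
		fmt.Printf("%q -> len=%d\n", n, roundedNameLen(n))
	}
	// "a" and "file.txt" round to 16; the 16-character name rounds to 32
	// because its null terminator no longer fits in one block.
}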
+func (e *Event) sizeOf() int {
+ s := inotifyEventBaseSize + int(e.len)
+ if s < inotifyEventBaseSize {
+ panic("Overflowed event size")
+ }
+ return s
+}
+
+// CopyTo serializes this event to dst. buf is used as a scratch buffer to
+// construct the output. We use a buffer allocated ahead of time for
+// performance. buf must be at least inotifyEventBaseSize bytes.
+func (e *Event) CopyTo(ctx context.Context, buf []byte, dst usermem.IOSequence) (int64, error) {
+ usermem.ByteOrder.PutUint32(buf[0:], uint32(e.wd))
+ usermem.ByteOrder.PutUint32(buf[4:], e.mask)
+ usermem.ByteOrder.PutUint32(buf[8:], e.cookie)
+ usermem.ByteOrder.PutUint32(buf[12:], e.len)
+
+ writeLen := 0
+
+ n, err := dst.CopyOut(ctx, buf)
+ if err != nil {
+ return 0, err
+ }
+ writeLen += n
+ dst = dst.DropFirst(n)
+
+ if e.len > 0 {
+ n, err = dst.CopyOut(ctx, e.name)
+ if err != nil {
+ return 0, err
+ }
+ writeLen += n
+ }
+
+ // Sanity check.
+ if writeLen != e.sizeOf() {
+ panic(fmt.Sprintf("Serialized unexpected amount of data for an event, expected %d, wrote %d.", e.sizeOf(), writeLen))
+ }
+
+ return int64(writeLen), nil
+}
+
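For illustration, the bytes CopyTo produces are a 16-byte header (wd, mask, cookie, len) followed by the padded name; a standalone sketch, assuming a little-endian host as usermem.ByteOrder is on x86/arm64:

package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	const IN_CREATE = 0x00000100
	name := append([]byte("hello"), make([]byte, 11)...) // padded to 16 bytes

	buf := make([]byte, 16)
	binary.LittleEndian.PutUint32(buf[0:], 1)                  // wd
	binary.LittleEndian.PutUint32(buf[4:], IN_CREATE)          // mask
	binary.LittleEndian.PutUint32(buf[8:], 0)                  // cookie
	binary.LittleEndian.PutUint32(buf[12:], uint32(len(name))) // len

	out := append(buf, name...)
	fmt.Println(len(out)) // 32: one header plus one 16-byte name block
}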
+func (e *Event) equals(other *Event) bool {
+ return e.wd == other.wd &&
+ e.mask == other.mask &&
+ e.cookie == other.cookie &&
+ e.len == other.len &&
+ bytes.Equal(e.name, other.name)
+}
+
+// InotifyEventFromStatMask generates the appropriate events for an operation
+// that sets the stat fields specified in mask.
+func InotifyEventFromStatMask(mask uint32) uint32 {
+ var ev uint32
+ if mask&(linux.STATX_UID|linux.STATX_GID|linux.STATX_MODE) != 0 {
+ ev |= linux.IN_ATTRIB
+ }
+ if mask&linux.STATX_SIZE != 0 {
+ ev |= linux.IN_MODIFY
+ }
+
+ if (mask & (linux.STATX_ATIME | linux.STATX_MTIME)) == (linux.STATX_ATIME | linux.STATX_MTIME) {
+ // Both times indicates a utime(s) call.
+ ev |= linux.IN_ATTRIB
+ } else if mask&linux.STATX_ATIME != 0 {
+ ev |= linux.IN_ACCESS
+ } else if mask&linux.STATX_MTIME != 0 {
+ ev |= linux.IN_MODIFY
+ }
+ return ev
+}
+
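A standalone recomputation of the mapping above, including the utimes special case where setting both timestamps yields IN_ATTRIB rather than IN_ACCESS|IN_MODIFY (constants mirror the Linux uapi headers; the helper name is illustrative):

package main

import "fmt"

const (
	STATX_MODE  = 0x0002
	STATX_UID   = 0x0008
	STATX_ATIME = 0x0020
	STATX_MTIME = 0x0040
	STATX_SIZE  = 0x0200

	IN_ACCESS = 0x0001
	IN_MODIFY = 0x0002
	IN_ATTRIB = 0x0004
)

func eventsFromStatMask(mask uint32) uint32 {
	var ev uint32
	if mask&(STATX_UID|STATX_MODE) != 0 {
		ev |= IN_ATTRIB
	}
	if mask&STATX_SIZE != 0 {
		ev |= IN_MODIFY
	}
	switch {
	case mask&(STATX_ATIME|STATX_MTIME) == STATX_ATIME|STATX_MTIME:
		ev |= IN_ATTRIB // both timestamps set indicates a utime(s) call
	case mask&STATX_ATIME != 0:
		ev |= IN_ACCESS
	case mask&STATX_MTIME != 0:
		ev |= IN_MODIFY
	}
	return ev
}

func main() {
	fmt.Printf("%#x\n", eventsFromStatMask(STATX_ATIME|STATX_MTIME)) // 0x4 (IN_ATTRIB)
	fmt.Printf("%#x\n", eventsFromStatMask(STATX_SIZE))              // 0x2 (IN_MODIFY)
}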
+// InotifyRemoveChild sends the appropriate notifications to the watch sets of
+// the child being removed and its parent. Note that unlike most pairs of
+// parent/child notifications, the child is notified first in this case.
+func InotifyRemoveChild(ctx context.Context, self, parent *Watches, name string) {
+ if self != nil {
+ self.Notify(ctx, "", linux.IN_ATTRIB, 0, InodeEvent, true /* unlinked */)
+ }
+ if parent != nil {
+ parent.Notify(ctx, name, linux.IN_DELETE, 0, InodeEvent, true /* unlinked */)
+ }
+}
+
+// InotifyRename sends the appropriate notifications to the watch sets of the
+// file being renamed and its old/new parents.
+func InotifyRename(ctx context.Context, renamed, oldParent, newParent *Watches, oldName, newName string, isDir bool) {
+ var dirEv uint32
+ if isDir {
+ dirEv = linux.IN_ISDIR
+ }
+ cookie := uniqueid.InotifyCookie(ctx)
+ if oldParent != nil {
+ oldParent.Notify(ctx, oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent, false /* unlinked */)
+ }
+ if newParent != nil {
+ newParent.Notify(ctx, newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent, false /* unlinked */)
+ }
+ // Somewhat surprisingly, self move events do not have a cookie.
+ if renamed != nil {
+ renamed.Notify(ctx, "", linux.IN_MOVE_SELF, 0, InodeEvent, false /* unlinked */)
+ }
+}
diff --git a/pkg/sentry/vfs/lock/lock.go b/pkg/sentry/vfs/lock.go
index 724dfe743..6c7583a81 100644
--- a/pkg/sentry/vfs/lock/lock.go
+++ b/pkg/sentry/vfs/lock.go
@@ -17,9 +17,11 @@
//
// The actual implementations can be found in the lock package under
// sentry/fs/lock.
-package lock
+package vfs
import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -56,7 +58,11 @@ func (fl *FileLocks) UnlockBSD(uid fslock.UniqueID) {
}
// LockPOSIX tries to acquire a POSIX-style lock on a file region.
-func (fl *FileLocks) LockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error {
+func (fl *FileLocks) LockPOSIX(ctx context.Context, fd *FileDescription, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ rng, err := computeRange(ctx, fd, start, length, whence)
+ if err != nil {
+ return err
+ }
if fl.posix.LockRegion(uid, t, rng, block) {
return nil
}
@@ -67,6 +73,37 @@ func (fl *FileLocks) LockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fsloc
//
// This operation is always successful, even if there did not exist a lock on
// the requested region held by uid in the first place.
-func (fl *FileLocks) UnlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) {
+func (fl *FileLocks) UnlockPOSIX(ctx context.Context, fd *FileDescription, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ rng, err := computeRange(ctx, fd, start, length, whence)
+ if err != nil {
+ return err
+ }
fl.posix.UnlockRegion(uid, rng)
+ return nil
+}
+
+func computeRange(ctx context.Context, fd *FileDescription, start uint64, length uint64, whence int16) (fslock.LockRange, error) {
+ var off int64
+ switch whence {
+ case linux.SEEK_SET:
+ off = 0
+ case linux.SEEK_CUR:
+ // Note that Linux does not hold any mutexes while retrieving the file
+ // offset, see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk.
+ curOff, err := fd.Seek(ctx, 0, linux.SEEK_CUR)
+ if err != nil {
+ return fslock.LockRange{}, err
+ }
+ off = curOff
+ case linux.SEEK_END:
+ stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_SIZE})
+ if err != nil {
+ return fslock.LockRange{}, err
+ }
+ off = int64(stat.Size)
+ default:
+ return fslock.LockRange{}, syserror.EINVAL
+ }
+
+ return fslock.ComputeRange(int64(start), int64(length), off)
}
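A hedged, self-contained sketch of the range arithmetic computeRange delegates to fslock.ComputeRange: the locked region starts at off+start and covers length bytes, with length 0 conventionally meaning "to EOF". Names and error text here are illustrative, not the sentry's implementation:

package main

import (
	"errors"
	"fmt"
	"math"
)

const lockEOF = math.MaxUint64

func computeRange(start, length, off int64) (lo, hi uint64, err error) {
	begin := off + start
	if begin < 0 {
		return 0, 0, errors.New("EINVAL: negative lock start")
	}
	if length == 0 {
		return uint64(begin), lockEOF, nil // lock extends to EOF
	}
	end := begin + length
	if end < begin {
		return 0, 0, errors.New("EINVAL: lock range is invalid or overflows")
	}
	return uint64(begin), uint64(end), nil
}

func main() {
	// fcntl(F_SETLK) with whence=SEEK_CUR, current offset 100, start 10,
	// length 20 locks bytes [110, 130).
	lo, hi, _ := computeRange(10, 20, 100)
	fmt.Println(lo, hi)
}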
diff --git a/pkg/sentry/vfs/lock/BUILD b/pkg/sentry/vfs/lock/BUILD
deleted file mode 100644
index d9ab063b7..000000000
--- a/pkg/sentry/vfs/lock/BUILD
+++ /dev/null
@@ -1,13 +0,0 @@
-load("//tools:defs.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
- name = "lock",
- srcs = ["lock.go"],
- visibility = ["//pkg/sentry:internal"],
- deps = [
- "//pkg/sentry/fs/lock",
- "//pkg/syserror",
- ],
-)
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 02850b65c..d1d29d0cd 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -28,9 +28,6 @@ import (
"gvisor.dev/gvisor/pkg/syserror"
)
-// lastMountID is used to allocate mount ids. Must be accessed atomically.
-var lastMountID uint64
-
// A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem
// (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem
// (Mount.fs), which applies to path resolution in the context of a particular
@@ -58,6 +55,10 @@ type Mount struct {
// ID is the immutable mount ID.
ID uint64
+ // Flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except
+ // for MS_RDONLY which is tracked in "writers". Immutable.
+ Flags MountFlags
+
// key is protected by VirtualFilesystem.mountMu and
// VirtualFilesystem.mounts.seq, and may be nil. References are held on
// key.parent and key.point if they are not nil.
@@ -84,10 +85,6 @@ type Mount struct {
// umounted is true. umounted is protected by VirtualFilesystem.mountMu.
umounted bool
- // flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except
- // for MS_RDONLY which is tracked in "writers".
- flags MountFlags
-
// The lower 63 bits of writers is the number of calls to
// Mount.CheckBeginWrite() that have not yet been paired with a call to
// Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect.
@@ -97,11 +94,11 @@ type Mount struct {
func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount {
mnt := &Mount{
- ID: atomic.AddUint64(&lastMountID, 1),
+ ID: atomic.AddUint64(&vfs.lastMountID, 1),
+ Flags: opts.Flags,
vfs: vfs,
fs: fs,
root: root,
- flags: opts.Flags,
ns: mntns,
refs: 1,
}
@@ -111,8 +108,17 @@ func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *Mount
return mnt
}
-// A MountNamespace is a collection of Mounts.
-//
+// Options returns a copy of the MountOptions currently applicable to mnt.
+func (mnt *Mount) Options() MountOptions {
+ mnt.vfs.mountMu.Lock()
+ defer mnt.vfs.mountMu.Unlock()
+ return MountOptions{
+ Flags: mnt.Flags,
+ ReadOnly: mnt.readOnly(),
+ }
+}
+
+// A MountNamespace is a collection of Mounts.
+//
// MountNamespaces are reference-counted. Unless otherwise specified, all
// MountNamespace methods require that a reference is held.
//
@@ -120,6 +126,9 @@ func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *Mount
//
// +stateify savable
type MountNamespace struct {
+ // Owner is the user namespace that owns this mount namespace.
+ Owner *auth.UserNamespace
+
// root is the MountNamespace's root mount. root is immutable.
root *Mount
@@ -148,7 +157,7 @@ type MountNamespace struct {
func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) {
rft := vfs.getFilesystemType(fsTypeName)
if rft == nil {
- ctx.Warningf("Unknown filesystem: %s", fsTypeName)
+ ctx.Warningf("Unknown filesystem type: %s", fsTypeName)
return nil, syserror.ENODEV
}
fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
@@ -156,6 +165,7 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
return nil, err
}
mntns := &MountNamespace{
+ Owner: creds.UserNamespace,
refs: 1,
mountpoints: make(map[*Dentry]uint32),
}
@@ -175,26 +185,34 @@ func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry,
return newMount(vfs, fs, root, nil /* mntns */, opts), nil
}
-// MountAt creates and mounts a Filesystem configured by the given arguments.
-func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
+// MountDisconnected creates a Filesystem configured by the given arguments,
+// then returns a Mount representing it. The new Mount is not associated with
+// any MountNamespace and is not connected to any other Mounts.
+func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth.Credentials, source string, fsTypeName string, opts *MountOptions) (*Mount, error) {
rft := vfs.getFilesystemType(fsTypeName)
if rft == nil {
- return syserror.ENODEV
+ return nil, syserror.ENODEV
}
if !opts.InternalMount && !rft.opts.AllowUserMount {
- return syserror.ENODEV
+ return nil, syserror.ENODEV
}
fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
if err != nil {
- return err
+ return nil, err
}
+ defer root.DecRef(ctx)
+ defer fs.DecRef(ctx)
+ return vfs.NewDisconnectedMount(fs, root, opts)
+}
+// ConnectMountAt connects mnt at the path represented by target.
+//
+// Preconditions: mnt must be disconnected.
+func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Credentials, mnt *Mount, target *PathOperation) error {
// We can't hold vfs.mountMu while calling FilesystemImpl methods due to
// lock ordering.
vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
if err != nil {
- root.DecRef()
- fs.DecRef()
return err
}
vfs.mountMu.Lock()
@@ -203,9 +221,7 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia
if vd.dentry.dead {
vd.dentry.mu.Unlock()
vfs.mountMu.Unlock()
- vd.DecRef()
- root.DecRef()
- fs.DecRef()
+ vd.DecRef(ctx)
return syserror.ENOENT
}
// vd might have been mounted over between vfs.GetDentryAt() and
@@ -227,7 +243,7 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia
// This can't fail since we're holding vfs.mountMu.
nextmnt.root.IncRef()
vd.dentry.mu.Unlock()
- vd.DecRef()
+ vd.DecRef(ctx)
vd = VirtualDentry{
mount: nextmnt,
dentry: nextmnt.root,
@@ -238,7 +254,6 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia
// point and the mount root are directories, or neither are, and returns
// ENOTDIR if this is not the case.
mntns := vd.mount.ns
- mnt := newMount(vfs, fs, root, mntns, opts)
vfs.mounts.seq.BeginWrite()
vfs.connectLocked(mnt, vd, mntns)
vfs.mounts.seq.EndWrite()
@@ -247,6 +262,19 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia
return nil
}
+// MountAt creates and mounts a Filesystem configured by the given arguments.
+func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
+ mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts)
+ if err != nil {
+ return err
+ }
+ defer mnt.DecRef(ctx)
+ if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil {
+ return err
+ }
+ return nil
+}
+
// UmountAt removes the Mount at the given path.
func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error {
if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 {
@@ -254,6 +282,9 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti
}
// MNT_FORCE is currently unimplemented except for the permission check.
+ // Force unmounting specifically requires CAP_SYS_ADMIN in the root user
+ // namespace, and not in the owner user namespace for the target mount. See
+ // fs/namespace.c:SYSCALL_DEFINE2(umount, ...)
if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) {
return syserror.EPERM
}
@@ -262,13 +293,13 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti
if err != nil {
return err
}
- defer vd.DecRef()
+ defer vd.DecRef(ctx)
if vd.dentry != vd.mount.root {
return syserror.EINVAL
}
vfs.mountMu.Lock()
if mntns := MountNamespaceFromContext(ctx); mntns != nil {
- defer mntns.DecRef()
+ defer mntns.DecRef(ctx)
if mntns != vd.mount.ns {
vfs.mountMu.Unlock()
return syserror.EINVAL
@@ -304,10 +335,10 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti
vfs.mounts.seq.EndWrite()
vfs.mountMu.Unlock()
for _, vd := range vdsToDecRef {
- vd.DecRef()
+ vd.DecRef(ctx)
}
for _, mnt := range mountsToDecRef {
- mnt.DecRef()
+ mnt.DecRef(ctx)
}
return nil
}
@@ -369,14 +400,22 @@ func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecu
// references held by vd.
//
// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
-// writer critical section. d.mu must be locked. mnt.parent() == nil.
+// writer critical section. d.mu must be locked. mnt.parent() == nil, i.e. mnt
+// must not already be connected.
func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) {
+ if checkInvariants {
+ if mnt.parent() != nil {
+ panic("VFS.connectLocked called on connected mount")
+ }
+ }
+ mnt.IncRef() // dropped by callers of umountRecursiveLocked
mnt.storeKey(vd)
if vd.mount.children == nil {
vd.mount.children = make(map[*Mount]struct{})
}
vd.mount.children[mnt] = struct{}{}
atomic.AddUint32(&vd.dentry.mounts, 1)
+ mnt.ns = mntns
mntns.mountpoints[vd.dentry]++
vfs.mounts.insertSeqed(mnt)
vfsmpmounts, ok := vfs.mountpoints[vd.dentry]
@@ -394,6 +433,11 @@ func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns
// writer critical section. mnt.parent() != nil.
func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry {
vd := mnt.loadKey()
+ if checkInvariants {
+ if vd.mount == nil {
+ panic("VFS.disconnectLocked called on disconnected mount")
+ }
+ }
mnt.storeKey(VirtualDentry{})
delete(vd.mount.children, mnt)
atomic.AddUint32(&vd.dentry.mounts, math.MaxUint32) // -1
@@ -435,7 +479,7 @@ func (mnt *Mount) IncRef() {
}
// DecRef decrements mnt's reference count.
-func (mnt *Mount) DecRef() {
+func (mnt *Mount) DecRef(ctx context.Context) {
refs := atomic.AddInt64(&mnt.refs, -1)
if refs&^math.MinInt64 == 0 { // mask out MSB
var vd VirtualDentry
@@ -446,10 +490,10 @@ func (mnt *Mount) DecRef() {
mnt.vfs.mounts.seq.EndWrite()
mnt.vfs.mountMu.Unlock()
}
- mnt.root.DecRef()
- mnt.fs.DecRef()
+ mnt.root.DecRef(ctx)
+ mnt.fs.DecRef(ctx)
if vd.Ok() {
- vd.DecRef()
+ vd.DecRef(ctx)
}
}
}
@@ -462,7 +506,7 @@ func (mntns *MountNamespace) IncRef() {
}
// DecRef decrements mntns' reference count.
-func (mntns *MountNamespace) DecRef() {
+func (mntns *MountNamespace) DecRef(ctx context.Context) {
vfs := mntns.root.fs.VirtualFilesystem()
if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 {
vfs.mountMu.Lock()
@@ -473,10 +517,10 @@ func (mntns *MountNamespace) DecRef() {
vfs.mounts.seq.EndWrite()
vfs.mountMu.Unlock()
for _, vd := range vdsToDecRef {
- vd.DecRef()
+ vd.DecRef(ctx)
}
for _, mnt := range mountsToDecRef {
- mnt.DecRef()
+ mnt.DecRef(ctx)
}
} else if refs < 0 {
panic("MountNamespace.DecRef() called without holding a reference")
@@ -490,7 +534,7 @@ func (mntns *MountNamespace) DecRef() {
// getMountAt is analogous to Linux's fs/namei.c:follow_mount().
//
// Preconditions: References are held on mnt and d.
-func (vfs *VirtualFilesystem) getMountAt(mnt *Mount, d *Dentry) *Mount {
+func (vfs *VirtualFilesystem) getMountAt(ctx context.Context, mnt *Mount, d *Dentry) *Mount {
// The first mount is special-cased:
//
// - The caller is assumed to have checked d.isMounted() already. (This
@@ -521,7 +565,7 @@ retryFirst:
// Raced with umount.
continue
}
- mnt.DecRef()
+ mnt.DecRef(ctx)
mnt = next
d = next.root
}
@@ -534,7 +578,7 @@ retryFirst:
//
// Preconditions: References are held on mnt and root. vfsroot is not (mnt,
// mnt.root).
-func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) VirtualDentry {
+func (vfs *VirtualFilesystem) getMountpointAt(ctx context.Context, mnt *Mount, vfsroot VirtualDentry) VirtualDentry {
// The first mount is special-cased:
//
// - The caller must have already checked mnt against vfsroot.
@@ -558,12 +602,12 @@ retryFirst:
if !point.TryIncRef() {
// Since Mount holds a reference on Mount.key.point, this can only
// happen due to a racing change to Mount.key.
- parent.DecRef()
+ parent.DecRef(ctx)
goto retryFirst
}
if !vfs.mounts.seq.ReadOk(epoch) {
- point.DecRef()
- parent.DecRef()
+ point.DecRef(ctx)
+ parent.DecRef(ctx)
goto retryFirst
}
mnt = parent
@@ -591,16 +635,16 @@ retryFirst:
if !point.TryIncRef() {
// Since Mount holds a reference on Mount.key.point, this can
// only happen due to a racing change to Mount.key.
- parent.DecRef()
+ parent.DecRef(ctx)
goto retryNotFirst
}
if !vfs.mounts.seq.ReadOk(epoch) {
- point.DecRef()
- parent.DecRef()
+ point.DecRef(ctx)
+ parent.DecRef(ctx)
goto retryNotFirst
}
- d.DecRef()
- mnt.DecRef()
+ d.DecRef(ctx)
+ mnt.DecRef(ctx)
mnt = parent
d = point
}
@@ -715,7 +759,10 @@ func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDi
if mnt.readOnly() {
opts = "ro"
}
- if mnt.flags.NoExec {
+ if mnt.Flags.NoATime {
+ opts = ",noatime"
+ }
+ if mnt.Flags.NoExec {
opts += ",noexec"
}
@@ -800,11 +847,12 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo
if mnt.readOnly() {
opts = "ro"
}
- if mnt.flags.NoExec {
+ if mnt.Flags.NoATime {
+ opts = ",noatime"
+ }
+ if mnt.Flags.NoExec {
opts += ",noexec"
}
- // TODO(gvisor.dev/issue/1193): Add "noatime" if MS_NOATIME is
- // set.
fmt.Fprintf(buf, "%s ", opts)
// (7) Optional fields: zero or more fields of the form "tag[:value]".
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index bc7581698..70f850ca4 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -13,7 +13,7 @@
// limitations under the License.
// +build go1.12
-// +build !go1.15
+// +build !go1.16
// Check go:linkname function signatures when updating Go version.
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 53d364c5c..dfc8573fd 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -75,6 +75,21 @@ type MknodOptions struct {
type MountFlags struct {
// NoExec is equivalent to MS_NOEXEC.
NoExec bool
+
+ // NoATime is equivalent to MS_NOATIME and indicates that the
+ // filesystem should not update access time in-place.
+ NoATime bool
+
+ // NoDev is equivalent to MS_NODEV and indicates that the
+ // filesystem should not allow access to devices (special files).
+ // TODO(gvisor.dev/issue/3186): respect this flag in non-FUSE
+ // filesystems.
+ NoDev bool
+
+ // NoSUID is equivalent to MS_NOSUID and indicates that the
+ // filesystem should not honor set-user-ID and set-group-ID bits or
+ // file capabilities when executing programs.
+ NoSUID bool
}
// MountOptions contains options to VirtualFilesystem.MountAt().
@@ -149,6 +164,12 @@ type SetStatOptions struct {
// == UTIME_OMIT (VFS users must unset the corresponding bit in Stat.Mask
// instead).
Stat linux.Statx
+
+ // NeedWritePerm indicates that write permission on the file is needed for
+ // this operation. This is needed for truncate(2) (note that ftruncate(2)
+ // does not require the same check--instead, it checks that the fd is
+ // writable).
+ NeedWritePerm bool
}
// BoundEndpointOptions contains options to VirtualFilesystem.BoundEndpointAt()
diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go
index cd78d66bc..e4da15009 100644
--- a/pkg/sentry/vfs/pathname.go
+++ b/pkg/sentry/vfs/pathname.go
@@ -47,7 +47,7 @@ func (vfs *VirtualFilesystem) PathnameWithDeleted(ctx context.Context, vfsroot,
haveRef := false
defer func() {
if haveRef {
- vd.DecRef()
+ vd.DecRef(ctx)
}
}()
@@ -64,12 +64,12 @@ loop:
// of FilesystemImpl.PrependPath() may return nil instead.
break loop
}
- nextVD := vfs.getMountpointAt(vd.mount, vfsroot)
+ nextVD := vfs.getMountpointAt(ctx, vd.mount, vfsroot)
if !nextVD.Ok() {
break loop
}
if haveRef {
- vd.DecRef()
+ vd.DecRef(ctx)
}
vd = nextVD
haveRef = true
@@ -101,7 +101,7 @@ func (vfs *VirtualFilesystem) PathnameReachable(ctx context.Context, vfsroot, vd
haveRef := false
defer func() {
if haveRef {
- vd.DecRef()
+ vd.DecRef(ctx)
}
}()
loop:
@@ -112,12 +112,12 @@ loop:
if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
break loop
}
- nextVD := vfs.getMountpointAt(vd.mount, vfsroot)
+ nextVD := vfs.getMountpointAt(ctx, vd.mount, vfsroot)
if !nextVD.Ok() {
return "", nil
}
if haveRef {
- vd.DecRef()
+ vd.DecRef(ctx)
}
vd = nextVD
haveRef = true
@@ -145,7 +145,7 @@ func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd
haveRef := false
defer func() {
if haveRef {
- vd.DecRef()
+ vd.DecRef(ctx)
}
}()
unreachable := false
@@ -157,13 +157,13 @@ loop:
if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
break loop
}
- nextVD := vfs.getMountpointAt(vd.mount, vfsroot)
+ nextVD := vfs.getMountpointAt(ctx, vd.mount, vfsroot)
if !nextVD.Ok() {
unreachable = true
break loop
}
if haveRef {
- vd.DecRef()
+ vd.DecRef(ctx)
}
vd = nextVD
haveRef = true
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
index f9647f90e..33389c1df 100644
--- a/pkg/sentry/vfs/permissions.go
+++ b/pkg/sentry/vfs/permissions.go
@@ -94,6 +94,37 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, mode linu
return syserror.EACCES
}
+// MayLink determines whether creating a hard link to a file with the given
+// mode, kuid, and kgid is permitted.
+//
+// This corresponds to Linux's fs/namei.c:may_linkat.
+func MayLink(creds *auth.Credentials, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
+ // Source inode owner can hardlink all they like; otherwise, it must be a
+ // safe source.
+ if CanActAsOwner(creds, kuid) {
+ return nil
+ }
+
+ // Only regular files can be hard linked.
+ if mode.FileType() != linux.S_IFREG {
+ return syserror.EPERM
+ }
+
+ // Setuid files should not get pinned to the filesystem.
+ if mode&linux.S_ISUID != 0 {
+ return syserror.EPERM
+ }
+
+ // Executable setgid files should not get pinned to the filesystem, but we
+ // don't support S_IXGRP anyway.
+
+ // Hardlinking to unreadable or unwritable sources is dangerous.
+ if err := GenericCheckPermissions(creds, MayRead|MayWrite, mode, kuid, kgid); err != nil {
+ return syserror.EPERM
+ }
+ return nil
+}
+
// AccessTypesForOpenFlags returns the access types required to open a file
// with the given OpenOptions.Flags. Note that this is NOT the same thing as
// the set of accesses permitted for the opened file:
@@ -152,7 +183,8 @@ func MayWriteFileWithOpenFlags(flags uint32) bool {
// CheckSetStat checks that creds has permission to change the metadata of a
// file with the given permissions, UID, and GID as specified by stat, subject
// to the rules of Linux's fs/attr.c:setattr_prepare().
-func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
+func CheckSetStat(ctx context.Context, creds *auth.Credentials, opts *SetStatOptions, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
+ stat := &opts.Stat
if stat.Mask&linux.STATX_SIZE != 0 {
limit, err := CheckLimit(ctx, 0, int64(stat.Size))
if err != nil {
@@ -184,6 +216,11 @@ func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Stat
return syserror.EPERM
}
}
+ if opts.NeedWritePerm && !creds.HasCapability(linux.CAP_DAC_OVERRIDE) {
+ if err := GenericCheckPermissions(creds, MayWrite, mode, kuid, kgid); err != nil {
+ return err
+ }
+ }
if stat.Mask&(linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME) != 0 {
if !CanActAsOwner(creds, kuid) {
if (stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW) ||
@@ -199,6 +236,20 @@ func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Stat
return nil
}
+// CheckDeleteSticky checks whether the sticky bit is set on a directory with
+// the given file mode, and if so, checks whether creds has permission to
+// remove a file owned by childKUID from a directory with the given mode.
+// CheckDeleteSticky is consistent with include/linux/fs.h:check_sticky().
+func CheckDeleteSticky(creds *auth.Credentials, parentMode linux.FileMode, childKUID auth.KUID) error {
+ if parentMode&linux.ModeSticky == 0 {
+ return nil
+ }
+ if CanActAsOwner(creds, childKUID) {
+ return nil
+ }
+ return syserror.EPERM
+}
+
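A minimal standalone model of the sticky-bit rule, simplified to UID equality (CheckDeleteSticky actually defers to CanActAsOwner, which also admits capable callers):

package main

import (
	"errors"
	"fmt"
)

const modeSticky = 0o1000

func checkDeleteSticky(parentMode uint16, callerUID, childUID uint32) error {
	if parentMode&modeSticky == 0 {
		return nil // no sticky bit: normal write permission suffices
	}
	if callerUID == childUID {
		return nil // owner may remove their own file
	}
	return errors.New("EPERM")
}

func main() {
	fmt.Println(checkDeleteSticky(0o1777, 1000, 1001)) // EPERM, /tmp-style dir
	fmt.Println(checkDeleteSticky(0o0755, 1000, 1001)) // <nil>
}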
// CanActAsOwner returns true if creds can act as the owner of a file with the
// given owning UID, consistent with Linux's
// fs/inode.c:inode_owner_or_capable().
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index 9d047ff88..3304372d9 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -18,6 +18,7 @@ import (
"fmt"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sync"
@@ -136,31 +137,31 @@ func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *Pat
return rp
}
-func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) {
+func (vfs *VirtualFilesystem) putResolvingPath(ctx context.Context, rp *ResolvingPath) {
rp.root = VirtualDentry{}
- rp.decRefStartAndMount()
+ rp.decRefStartAndMount(ctx)
rp.mount = nil
rp.start = nil
- rp.releaseErrorState()
+ rp.releaseErrorState(ctx)
resolvingPathPool.Put(rp)
}
-func (rp *ResolvingPath) decRefStartAndMount() {
+func (rp *ResolvingPath) decRefStartAndMount(ctx context.Context) {
if rp.flags&rpflagsHaveStartRef != 0 {
- rp.start.DecRef()
+ rp.start.DecRef(ctx)
}
if rp.flags&rpflagsHaveMountRef != 0 {
- rp.mount.DecRef()
+ rp.mount.DecRef(ctx)
}
}
-func (rp *ResolvingPath) releaseErrorState() {
+func (rp *ResolvingPath) releaseErrorState(ctx context.Context) {
if rp.nextStart != nil {
- rp.nextStart.DecRef()
+ rp.nextStart.DecRef(ctx)
rp.nextStart = nil
}
if rp.nextMount != nil {
- rp.nextMount.DecRef()
+ rp.nextMount.DecRef(ctx)
rp.nextMount = nil
}
}
@@ -236,13 +237,13 @@ func (rp *ResolvingPath) Advance() {
// Restart resets the stream of path components represented by rp to its state
// on entry to the current FilesystemImpl method.
-func (rp *ResolvingPath) Restart() {
+func (rp *ResolvingPath) Restart(ctx context.Context) {
rp.pit = rp.origParts[rp.numOrigParts-1]
rp.mustBeDir = rp.mustBeDirOrig
rp.symlinks = rp.symlinksOrig
rp.curPart = rp.numOrigParts - 1
copy(rp.parts[:], rp.origParts[:rp.numOrigParts])
- rp.releaseErrorState()
+ rp.releaseErrorState(ctx)
}
func (rp *ResolvingPath) relpathCommit() {
@@ -260,13 +261,13 @@ func (rp *ResolvingPath) relpathCommit() {
// Mount, CheckRoot returns (unspecified, non-nil error). Otherwise, path
// resolution should resolve d's parent normally, and CheckRoot returns (false,
// nil).
-func (rp *ResolvingPath) CheckRoot(d *Dentry) (bool, error) {
+func (rp *ResolvingPath) CheckRoot(ctx context.Context, d *Dentry) (bool, error) {
if d == rp.root.dentry && rp.mount == rp.root.mount {
// At contextual VFS root (due to e.g. chroot(2)).
return true, nil
} else if d == rp.mount.root {
// At mount root ...
- vd := rp.vfs.getMountpointAt(rp.mount, rp.root)
+ vd := rp.vfs.getMountpointAt(ctx, rp.mount, rp.root)
if vd.Ok() {
// ... of non-root mount.
rp.nextMount = vd.mount
@@ -283,11 +284,11 @@ func (rp *ResolvingPath) CheckRoot(d *Dentry) (bool, error) {
// to d. If d is a mount point, such that path resolution should switch to
// another Mount, CheckMount returns a non-nil error. Otherwise, CheckMount
// returns nil.
-func (rp *ResolvingPath) CheckMount(d *Dentry) error {
+func (rp *ResolvingPath) CheckMount(ctx context.Context, d *Dentry) error {
if !d.isMounted() {
return nil
}
- if mnt := rp.vfs.getMountAt(rp.mount, d); mnt != nil {
+ if mnt := rp.vfs.getMountAt(ctx, rp.mount, d); mnt != nil {
rp.nextMount = mnt
return resolveMountPointError{}
}
@@ -389,11 +390,11 @@ func (rp *ResolvingPath) HandleJump(target VirtualDentry) error {
return resolveMountRootOrJumpError{}
}
-func (rp *ResolvingPath) handleError(err error) bool {
+func (rp *ResolvingPath) handleError(ctx context.Context, err error) bool {
switch err.(type) {
case resolveMountRootOrJumpError:
// Switch to the new Mount. We hold references on the Mount and Dentry.
- rp.decRefStartAndMount()
+ rp.decRefStartAndMount(ctx)
rp.mount = rp.nextMount
rp.start = rp.nextStart
rp.flags |= rpflagsHaveMountRef | rpflagsHaveStartRef
@@ -412,7 +413,7 @@ func (rp *ResolvingPath) handleError(err error) bool {
case resolveMountPointError:
// Switch to the new Mount. We hold a reference on the Mount, but
// borrow the reference on the mount root from the Mount.
- rp.decRefStartAndMount()
+ rp.decRefStartAndMount(ctx)
rp.mount = rp.nextMount
rp.start = rp.nextMount.root
rp.flags = rp.flags&^rpflagsHaveStartRef | rpflagsHaveMountRef
@@ -423,12 +424,12 @@ func (rp *ResolvingPath) handleError(err error) bool {
// path.
rp.relpathCommit()
// Restart path resolution on the new Mount.
- rp.releaseErrorState()
+ rp.releaseErrorState(ctx)
return true
case resolveAbsSymlinkError:
// Switch to the new Mount. References are borrowed from rp.root.
- rp.decRefStartAndMount()
+ rp.decRefStartAndMount(ctx)
rp.mount = rp.root.mount
rp.start = rp.root.dentry
rp.flags &^= rpflagsHaveMountRef | rpflagsHaveStartRef
@@ -440,7 +441,7 @@ func (rp *ResolvingPath) handleError(err error) bool {
// path, including the symlink target we just prepended.
rp.relpathCommit()
// Restart path resolution on the new Mount.
- rp.releaseErrorState()
+ rp.releaseErrorState(ctx)
return true
default:
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 8d7f8f8af..9c2420683 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -24,6 +24,9 @@
// Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry
// VirtualFilesystem.filesystemsMu
// EpollInstance.mu
+// Inotify.mu
+// Watches.mu
+// Inotify.evMu
// VirtualFilesystem.fsTypesMu
//
// Locking Dentry.mu in multiple Dentries requires holding
@@ -82,6 +85,10 @@ type VirtualFilesystem struct {
// mountpoints is analogous to Linux's mountpoint_hashtable.
mountpoints map[*Dentry]map[*Mount]struct{}
+ // lastMountID is the last allocated mount ID. lastMountID is accessed
+ // using atomic memory operations.
+ lastMountID uint64
+
// anonMount is a Mount, not included in mounts or mountpoints,
// representing an anonFilesystem. anonMount is used to back
// VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
@@ -115,7 +122,10 @@ type VirtualFilesystem struct {
}
// Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes.
-func (vfs *VirtualFilesystem) Init() error {
+func (vfs *VirtualFilesystem) Init(ctx context.Context) error {
+ if vfs.mountpoints != nil {
+ panic("VFS already initialized")
+ }
vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{})
vfs.devices = make(map[devTuple]*registeredDevice)
vfs.anonBlockDevMinorNext = 1
@@ -135,7 +145,7 @@ func (vfs *VirtualFilesystem) Init() error {
devMinor: anonfsDevMinor,
}
anonfs.vfsfs.Init(vfs, &anonFilesystemType{}, &anonfs)
- defer anonfs.vfsfs.DecRef()
+ defer anonfs.vfsfs.DecRef(ctx)
anonMount, err := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{})
if err != nil {
// We should not be passing any MountOptions that would cause
@@ -182,11 +192,11 @@ func (vfs *VirtualFilesystem) AccessAt(ctx context.Context, creds *auth.Credenti
for {
err := rp.mount.fs.impl.AccessAt(ctx, rp, creds, ats)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return nil
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return err
}
}
@@ -204,11 +214,11 @@ func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Crede
dentry: d,
}
rp.mount.IncRef()
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return vd, nil
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return VirtualDentry{}, err
}
}
@@ -226,7 +236,7 @@ func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *au
}
rp.mount.IncRef()
name := rp.Component()
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return parentVD, name, nil
}
if checkInvariants {
@@ -234,8 +244,8 @@ func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *au
panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return VirtualDentry{}, "", err
}
}
@@ -250,14 +260,14 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential
}
if !newpop.Path.Begin.Ok() {
- oldVD.DecRef()
+ oldVD.DecRef(ctx)
if newpop.Path.Absolute {
return syserror.EEXIST
}
return syserror.ENOENT
}
if newpop.FollowFinalSymlink {
- oldVD.DecRef()
+ oldVD.DecRef(ctx)
ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink")
return syserror.EINVAL
}
@@ -266,8 +276,8 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential
for {
err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD)
if err == nil {
- vfs.putResolvingPath(rp)
- oldVD.DecRef()
+ vfs.putResolvingPath(ctx, rp)
+ oldVD.DecRef(ctx)
return nil
}
if checkInvariants {
@@ -275,9 +285,9 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential
panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
- oldVD.DecRef()
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
+ oldVD.DecRef(ctx)
return err
}
}
@@ -303,7 +313,7 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia
for {
err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return nil
}
if checkInvariants {
@@ -311,8 +321,8 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia
panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return err
}
}
@@ -336,7 +346,7 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia
for {
err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return nil
}
if checkInvariants {
@@ -344,8 +354,8 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia
panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return err
}
}
@@ -398,30 +408,31 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
for {
fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
if opts.FileExec {
- if fd.Mount().flags.NoExec {
- fd.DecRef()
+ if fd.Mount().Flags.NoExec {
+ fd.DecRef(ctx)
return nil, syserror.EACCES
}
// Only a regular file can be executed.
stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE})
if err != nil {
- fd.DecRef()
+ fd.DecRef(ctx)
return nil, err
}
if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG {
- fd.DecRef()
+ fd.DecRef(ctx)
return nil, syserror.EACCES
}
}
+ fd.Dentry().InotifyWithParent(ctx, linux.IN_OPEN, 0, PathEvent)
return fd, nil
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return nil, err
}
}
@@ -433,11 +444,11 @@ func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Creden
for {
target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return target, nil
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return "", err
}
}
@@ -461,19 +472,19 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti
return err
}
if oldName == "." || oldName == ".." {
- oldParentVD.DecRef()
+ oldParentVD.DecRef(ctx)
return syserror.EBUSY
}
if !newpop.Path.Begin.Ok() {
- oldParentVD.DecRef()
+ oldParentVD.DecRef(ctx)
if newpop.Path.Absolute {
return syserror.EBUSY
}
return syserror.ENOENT
}
if newpop.FollowFinalSymlink {
- oldParentVD.DecRef()
+ oldParentVD.DecRef(ctx)
ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink")
return syserror.EINVAL
}
@@ -486,8 +497,8 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti
for {
err := rp.mount.fs.impl.RenameAt(ctx, rp, oldParentVD, oldName, renameOpts)
if err == nil {
- vfs.putResolvingPath(rp)
- oldParentVD.DecRef()
+ vfs.putResolvingPath(ctx, rp)
+ oldParentVD.DecRef(ctx)
return nil
}
if checkInvariants {
@@ -495,9 +506,9 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti
panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
- oldParentVD.DecRef()
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
+ oldParentVD.DecRef(ctx)
return err
}
}
@@ -520,7 +531,7 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia
for {
err := rp.mount.fs.impl.RmdirAt(ctx, rp)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return nil
}
if checkInvariants {
@@ -528,8 +539,8 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia
panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return err
}
}
@@ -541,11 +552,11 @@ func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credent
for {
err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return nil
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return err
}
}
@@ -557,11 +568,11 @@ func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credential
for {
stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return stat, nil
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return linux.Statx{}, err
}
}
@@ -574,11 +585,11 @@ func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credenti
for {
statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return statfs, nil
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return linux.Statfs{}, err
}
}
@@ -601,7 +612,7 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent
for {
err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return nil
}
if checkInvariants {
@@ -609,8 +620,8 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent
panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return err
}
}
@@ -633,7 +644,7 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti
for {
err := rp.mount.fs.impl.UnlinkAt(ctx, rp)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return nil
}
if checkInvariants {
@@ -641,8 +652,8 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti
panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return err
}
}
@@ -660,7 +671,7 @@ func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.C
for {
bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return bep, nil
}
if checkInvariants {
@@ -668,8 +679,8 @@ func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.C
panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return nil, err
}
}
@@ -682,7 +693,7 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede
for {
names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp, size)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return names, nil
}
if err == syserror.ENOTSUP {
@@ -690,11 +701,11 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede
// fs/xattr.c:vfs_listxattr() falls back to allowing the security
// subsystem to return security extended attributes, which by
// default don't exist.
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return nil, nil
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return nil, err
}
}
@@ -707,11 +718,11 @@ func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Creden
for {
val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return val, nil
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return "", err
}
}
@@ -724,11 +735,11 @@ func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Creden
for {
err := rp.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return nil
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return err
}
}
@@ -740,11 +751,11 @@ func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Cre
for {
err := rp.mount.fs.impl.RemovexattrAt(ctx, rp, name)
if err == nil {
- vfs.putResolvingPath(rp)
+ vfs.putResolvingPath(ctx, rp)
return nil
}
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
+ if !rp.handleError(ctx, err) {
+ vfs.putResolvingPath(ctx, rp)
return err
}
}
@@ -766,7 +777,7 @@ func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error {
if err := fs.impl.Sync(ctx); err != nil && retErr == nil {
retErr = err
}
- fs.DecRef()
+ fs.DecRef(ctx)
}
return retErr
}
@@ -820,9 +831,9 @@ func (vd VirtualDentry) IncRef() {
// DecRef decrements the reference counts on the Mount and Dentry represented
// by vd.
-func (vd VirtualDentry) DecRef() {
- vd.dentry.DecRef()
- vd.mount.DecRef()
+func (vd VirtualDentry) DecRef(ctx context.Context) {
+ vd.dentry.DecRef(ctx)
+ vd.mount.DecRef(ctx)
}
// Mount returns the Mount associated with vd. It does not take a reference on
diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go
index 101497ed6..748273366 100644
--- a/pkg/sentry/watchdog/watchdog.go
+++ b/pkg/sentry/watchdog/watchdog.go
@@ -77,7 +77,10 @@ var DefaultOpts = Opts{
// trigger it.
const descheduleThreshold = 1 * time.Second
-var stuckTasks = metric.MustCreateNewUint64Metric("/watchdog/stuck_tasks_detected", true /* sync */, "Cumulative count of stuck tasks detected")
+var (
+ stuckStartup = metric.MustCreateNewUint64Metric("/watchdog/stuck_startup_detected", true /* sync */, "Incremented once on startup watchdog timeout")
+ stuckTasks = metric.MustCreateNewUint64Metric("/watchdog/stuck_tasks_detected", true /* sync */, "Cumulative count of stuck tasks detected")
+)
// Amount of time to wait before dumping the stack to the log again when the same task(s) remains stuck.
var stackDumpSameTaskPeriod = time.Minute
@@ -220,6 +223,9 @@ func (w *Watchdog) waitForStart() {
// We are fine.
return
}
+
+ stuckStartup.Increment()
+
var buf bytes.Buffer
buf.WriteString(fmt.Sprintf("Watchdog.Start() not called within %s", w.StartupTimeout))
w.doAction(w.StartupTimeoutAction, false, &buf)
@@ -323,13 +329,13 @@ func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound boo
func (w *Watchdog) reportStuckWatchdog() {
var buf bytes.Buffer
- buf.WriteString("Watchdog goroutine is stuck:")
+ buf.WriteString("Watchdog goroutine is stuck")
w.doAction(w.TaskTimeoutAction, false, &buf)
}
// doAction will take the given action. If the action is LogWarning, the stack
-// is not always dumpped to the log to prevent log flooding. "forceStack"
-// guarantees that the stack will be dumped regarless.
+// is not always dumped to the log to prevent log flooding. "forceStack"
+// guarantees that the stack will be dumped regardless.
func (w *Watchdog) doAction(action Action, forceStack bool, msg *bytes.Buffer) {
switch action {
case LogWarning: